mirror of
https://github.com/chenasraf/snpr.git
synced 2026-05-18 01:39:01 +00:00
refactoring MendeleySearch
It now processes each page of results before fetching the next one, instead of collecting all pages first. This way the job should no longer use up all the memory when there are a lot of results.
This commit is contained in:
3
Gemfile
3
Gemfile
@@ -79,3 +79,6 @@ group :test do
|
||||
gem 'simplecov', require: false
|
||||
end
|
||||
|
||||
group :development, :test do
|
||||
gem 'uuidtools'
|
||||
end
|
||||
|
||||
@@ -295,6 +295,7 @@ GEM
|
||||
uglifier (2.1.2)
|
||||
execjs (>= 0.3.0)
|
||||
multi_json (~> 1.0, >= 1.0.2)
|
||||
uuidtools (2.1.4)
|
||||
vegas (0.1.11)
|
||||
rack (>= 1.0.0)
|
||||
warden (1.2.3)
|
||||
@@ -351,6 +352,7 @@ DEPENDENCIES
|
||||
therubyracer
|
||||
twitter-bootstrap-rails
|
||||
uglifier
|
||||
uuidtools
|
||||
vegas
|
||||
will_paginate
|
||||
yui-compressor
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
# A paper found through the Mendeley search and attached to a single SNP.
# Papers are identified by their Mendeley uuid, which must be unique.
class MendeleyPaper < ActiveRecord::Base
  belongs_to :snp

  validates :title, :snp, :uuid, presence: true
  validates :uuid, uniqueness: true

  # Only the title is indexed for full-text search.
  searchable { text :title }

  # Returns the stored first author, or "Unknown" when the stored value is
  # blank (nil or empty).
  def first_author
    stored = read_attribute(:first_author)
    stored.present? ? stored : "Unknown"
  end
end
|
||||
|
||||
@@ -2,7 +2,7 @@ class Snp < ActiveRecord::Base
|
||||
has_many :user_snps, foreign_key: :snp_name, primary_key: :name,
|
||||
dependent: :destroy
|
||||
has_many :plos_paper
|
||||
has_many :mendeley_paper
|
||||
has_many :mendeley_paper, dependent: :destroy
|
||||
has_many :snpedia_paper
|
||||
has_many :snp_comments
|
||||
has_many :genome_gov_paper
|
||||
@@ -44,5 +44,4 @@ class Snp < ActiveRecord::Base
|
||||
Sidekiq::Client.enqueue(Frequency,s.id)
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
@@ -3,93 +3,84 @@ require "net/http"
|
||||
require "json"
|
||||
|
||||
class MendeleySearch
|
||||
include Sidekiq::Worker
|
||||
sidekiq_options :queue => :mendeley, :retry => 5, :unique => true
|
||||
include Sidekiq::Worker
|
||||
attr_reader :snp
|
||||
|
||||
def perform(snp_id)
|
||||
snp = Snp.find(snp_id)
|
||||
if (snp.mendeley_updated.nil? || snp.mendeley_updated < 31.days.ago) &&
|
||||
snp.name.index("vg").nil? && snp.name.index("mt-").nil?
|
||||
page = 0
|
||||
items = 500
|
||||
documents = []
|
||||
begin
|
||||
begin
|
||||
result = Mendeley::API::Documents.
|
||||
search('"' + snp.name + '"', { items: items, page: page })
|
||||
documents.concat(result['documents'])
|
||||
puts result["total_pages"]
|
||||
puts page
|
||||
page += 1
|
||||
rescue => e
|
||||
puts e.class
|
||||
puts e.message
|
||||
puts "retrying..."
|
||||
sleep 1
|
||||
retry
|
||||
end
|
||||
sleep 1
|
||||
end while result['total_pages'].to_i > 0 &&
|
||||
result['total_pages'].to_i > result['current_page'].to_i
|
||||
sidekiq_options :queue => :mendeley, :retry => 5, :unique => true
|
||||
|
||||
if result["error"].present?
|
||||
puts "Mendeley API seems to be down."
|
||||
puts "Error is:"
|
||||
puts result["error"]
|
||||
return
|
||||
elsif documents.present?
|
||||
puts "mendeley: Found #{documents.size} papers"
|
||||
documents.each do |document|
|
||||
uuid = document["uuid"].to_s
|
||||
begin
|
||||
first_author = document["authors"].first["forename"] + ' ' +
|
||||
document["authors"].first["surname"]
|
||||
rescue => e
|
||||
puts "Something wrong in #{document["authors"]}: #{e.class}: #{e.message}"
|
||||
first_author = "Unknown"
|
||||
end
|
||||
# Sidekiq entry point. Loads the SNP with the given id and runs the Mendeley
# search for it when `update_mendeley?` says an update is due.
#
# snp_id - database id of the Snp to look up.
#
# Logs an error and does nothing else when no such SNP exists.
def perform(snp_id)
  @snp = Snp.where(id: snp_id).first
  unless snp
    logger.error("Snp(#{snp_id}) not found.")
    return
  end

  return search if update_mendeley?

  logger.info("mendeley papers for #{snp.name} do not need to be updated")
end
|
||||
|
||||
if MendeleyPaper.where(uuid: uuid).count == 0
|
||||
puts "-> paper is new"
|
||||
@mendeley_paper = MendeleyPaper.new(
|
||||
snp_id: snp.id,
|
||||
title: document['title'],
|
||||
mendeley_url: document['mendeley_url'],
|
||||
first_author: first_author,
|
||||
pub_year: document['year'],
|
||||
uuid: uuid
|
||||
)
|
||||
doi = document["doi"]
|
||||
@mendeley_paper.doi = doi if doi.present?
|
||||
@mendeley_paper.save
|
||||
snp.ranking = snp.mendeley_paper.count +
|
||||
2*snp.plos_paper.count + 5*snp.snpedia_paper.count +
|
||||
2*snp.genome_gov_paper.count + 2*snp.pgp_annotation.count
|
||||
|
||||
puts "-> Written paper"
|
||||
else
|
||||
puts "-> paper is old"
|
||||
@mendeley_paper = MendeleyPaper.find_by_uuid(uuid)
|
||||
if @mendeley_paper.title == ""
|
||||
puts "-> paper is broken and will be replaced now"
|
||||
@mendeley_paper.update_attributes(
|
||||
:title => document['title'],
|
||||
:snp_id => snp.id,
|
||||
:mendeley_url => document['mendeley_url'],
|
||||
:first_author => first_author,
|
||||
:pub_year => document['year']
|
||||
)
|
||||
end
|
||||
end
|
||||
Sidekiq::Client.enqueue(MendeleyDetails, @mendeley_paper.id)
|
||||
end
|
||||
else
|
||||
puts "mendeley: No papers found"
|
||||
end
|
||||
snp.mendeley_updated = Time.zone.now
|
||||
snp.save
|
||||
else
|
||||
puts "mendeley: time threshold not met"
|
||||
end
|
||||
end
|
||||
# Searches Mendeley for papers mentioning the current SNP's name (as a quoted
# phrase) and hands each page of results to `process_documents` as soon as it
# arrives, so that only one page is held at a time.
#
# After the last page the SNP is stamped with the update time and its ranking
# is recomputed from the counts of its associated papers and annotations.
#
# If the API answers with an "error" entry, a warning is logged and the method
# returns WITHOUT stamping the SNP, so the search is not considered done and
# will be attempted again the next time the job runs for this SNP.
#
# Raises RuntimeError when the SNP cannot be saved.
def search
  page = 0
  items = 500

  loop do
    result = Mendeley::API::Documents.
      search("\"#{snp.name}\"", { items: items, page: page })

    # Check for an API failure before touching the payload: an error response
    # must not be mistaken for "no papers found".
    if result["error"].present?
      logger.warn(
        "Mendeley API seems to be down.\nError is: #{result["error"]}")
      return
    end

    process_documents(result['documents'])
    page += 1
    sleep 1 # pause after every request, presumably to throttle API usage

    total_pages = result['total_pages'].to_i
    break unless total_pages > 0 && total_pages > result['current_page'].to_i
  end

  snp.mendeley_updated = Time.now
  snp.ranking = snp.mendeley_paper.count +
    2 * snp.plos_paper.count + 5 * snp.snpedia_paper.count +
    2 * snp.genome_gov_paper.count + 2 * snp.pgp_annotation.count

  snp.save || raise(
    "could not save snp(#{snp.name}): #{snp.errors.full_messages.join(", ")}")
end
|
||||
|
||||
# Creates or repairs a MendeleyPaper for every document of one result page.
#
# documents - Array of document hashes from a Mendeley search result; the keys
#             read here are "uuid", "title", "authors", "mendeley_url", "year"
#             and "doi". May be nil or empty.
#
# Papers that already exist and are valid are left untouched. New or invalid
# papers get their attributes (re)assigned from the document; when the save
# succeeds a MendeleyDetails job is enqueued for the paper, otherwise the
# validation errors are logged and nothing is enqueued.
def process_documents(documents)
  if documents.blank?
    logger.info("mendeley: No papers found")
    return
  end

  documents.each do |document|
    uuid = document["uuid"].to_s
    mendeley_paper = MendeleyPaper.find_or_initialize_by_uuid(uuid)
    next unless mendeley_paper.new_record? || !mendeley_paper.valid?

    # "authors" may be missing or empty; in that case first_author stays nil.
    author = (document["authors"] || []).first
    first_author = nil
    if author.present?
      first_author = "#{author["forename"]} #{author["surname"]}"
    end

    logger.info("creating or updating paper")
    mendeley_paper.attributes = mendeley_paper.attributes.merge(
      snp: snp,
      title: document['title'],
      mendeley_url: document['mendeley_url'],
      first_author: first_author,
      pub_year: document['year'],
      uuid: uuid,
      doi: document["doi"].presence
    )

    if mendeley_paper.valid? && mendeley_paper.save
      # Only papers that were actually stored get a details job.
      Sidekiq::Client.enqueue(MendeleyDetails, mendeley_paper.id)
    else
      messages = mendeley_paper.errors.full_messages.join(", ")
      logger.error("MendeleyPaper for #{snp.name} invalid.\n#{messages}")
    end
  end
end
|
||||
|
||||
# True when the SNP's Mendeley papers should be fetched again: the SNP was
# never updated or was last updated more than 31 days ago, and its name
# contains neither "vg" nor "mt-".
def update_mendeley?
  last_update = snp.mendeley_updated
  stale = last_update.nil? || last_update < 31.days.ago
  stale && !snp.name.include?("vg") && !snp.name.include?("mt-")
end
|
||||
end
|
||||
|
||||
@@ -277,9 +277,9 @@ ActiveRecord::Schema.define(:version => 20130904010950) do
|
||||
t.string "allele_frequency"
|
||||
t.integer "ranking"
|
||||
t.integer "number_of_users", :default => 0
|
||||
t.datetime "mendeley_updated", :default => '2013-07-31 12:44:49'
|
||||
t.datetime "plos_updated", :default => '2013-07-31 12:44:49'
|
||||
t.datetime "snpedia_updated", :default => '2013-07-31 12:44:49'
|
||||
t.datetime "mendeley_updated", :default => '2013-07-31 12:17:26'
|
||||
t.datetime "plos_updated", :default => '2013-07-31 12:17:26'
|
||||
t.datetime "snpedia_updated", :default => '2013-07-31 12:17:26'
|
||||
t.datetime "created_at", :null => false
|
||||
t.datetime "updated_at", :null => false
|
||||
end
|
||||
|
||||
@@ -83,4 +83,13 @@ FactoryGirl.define do
|
||||
steps 100
|
||||
floors 1
|
||||
end
|
||||
|
||||
factory :mendeley_paper do
|
||||
title "Musterstudie"
|
||||
uuid { UUIDTools::UUID.random_create }
|
||||
first_author "Max Mustermann"
|
||||
mendeley_url "http://example.com"
|
||||
doi "10.1000/182"
|
||||
pub_year 2013
|
||||
end
|
||||
end
|
||||
|
||||
123
test/unit/mendeley_search_test.rb
Normal file
123
test/unit/mendeley_search_test.rb
Normal file
@@ -0,0 +1,123 @@
|
||||
require_relative '../test_helper'
|
||||
|
||||
# Unit tests for the MendeleySearch worker: the decision whether to search,
# the search itself, and the processing of the returned documents.
class MendeleySearchTest < ActiveSupport::TestCase
  context "worker" do
    setup do
      @snp = FactoryGirl.build_stubbed(:snp, id: 1)
      @worker = MendeleySearch.new
      # One document as the worker reads it from a search result.
      @document = {
        "uuid" => UUIDTools::UUID.random_create.to_s,
        "title" => "Test Driven Development And Why You Should Do It",
        "authors" => [{ "forename" => "Max", "surname" => "Mustermann" }],
        "mendeley_url" => "http://example.com",
        "year" => "2013",
        "doi" => "456",
      }
    end

    should "do nothing if snp does not exist" do
      @worker.expects(:search).never
      @worker.expects(:update_mendeley?).never
      @worker.perform(0)
    end

    context "with existing snp" do
      setup do
        # Make Snp.where(...).first return the stubbed snp.
        Snp.stubs(:where).returns(Snp)
        Snp.stubs(:first).returns(@snp)
      end

      should "search for papers if the last update was too long ago" do
        @worker.expects(:search)
        @snp.stubs(:mendeley_updated).returns(32.days.ago)
        @worker.perform(1)
      end

      should "not search for papers if the last update was not too long ago" do
        @worker.expects(:search).never
        @snp.stubs(:mendeley_updated).returns(30.days.ago)
        @worker.perform(1)
      end

      should "search for papers if snp was never searched for" do
        @worker.expects(:search)
        @snp.stubs(:mendeley_updated).returns(nil)
        @worker.perform(1)
      end
    end

    context "searched-for papers" do
      setup do
        @worker.stubs(:snp).returns(@snp)
        # The worker sleeps after each result page; don't wait in tests.
        @worker.stubs(:sleep)
      end

      should "be processed" do
        Mendeley::API::Documents.expects(:search).
          with("\"#{@snp.name}\"", { items: 500, page: 0 }).
          returns({ "documents" => [@document] })
        @worker.expects(:process_documents).with([@document])

        @snp.expects(:mendeley_updated=).with do |time|
          assert time.is_a?(Time)
        end
        @snp.expects(:ranking=)
        @snp.expects(:save).returns(true)

        @worker.search
      end
    end

    context "processing documents" do
      setup do
        @worker.stubs(:snp).returns(@snp)
      end

      should "create papers that do not already exist" do
        uuid = @document["uuid"]
        new_mendeley_paper = MendeleyPaper.new(uuid: uuid)
        MendeleyPaper.expects(:find_or_initialize_by_uuid).with(uuid).
          returns(new_mendeley_paper)
        new_mendeley_paper.expects(:save).returns(true)
        Sidekiq::Client.expects(:enqueue).with do |klass, _id|
          assert_equal(MendeleyDetails, klass)
        end

        @worker.process_documents([@document])

        assert_equal @snp.id, new_mendeley_paper.snp_id
        assert_equal @document["title"], new_mendeley_paper.title
        assert_equal @document["mendeley_url"], new_mendeley_paper.mendeley_url
        assert_equal "Max Mustermann", new_mendeley_paper.first_author
        assert_equal @document["year"].to_i, new_mendeley_paper.pub_year
        assert_equal @document["uuid"], new_mendeley_paper.uuid
        assert_equal @document["doi"], new_mendeley_paper.doi
      end

      should "not update existing valid papers" do
        uuid = @document["uuid"]
        existing_mendeley_paper = FactoryGirl.
          build_stubbed(:mendeley_paper, uuid: uuid, snp: @snp)
        MendeleyPaper.expects(:find_or_initialize_by_uuid).with(uuid).
          returns(existing_mendeley_paper)
        MendeleyPaper.any_instance.expects(:save).never
        Sidekiq::Client.expects(:enqueue).never

        @worker.process_documents([@document])
      end

      should "update existing invalid papers" do
        uuid = @document["uuid"]
        # A paper without an snp fails the presence validation.
        existing_mendeley_paper = FactoryGirl.
          build_stubbed(:mendeley_paper, snp: nil)
        existing_mendeley_paper.expects(:save).returns(true)
        MendeleyPaper.expects(:find_or_initialize_by_uuid).with(uuid).
          returns(existing_mendeley_paper)
        Sidekiq::Client.expects(:enqueue)

        @worker.process_documents([@document])

        assert_equal @snp.id, existing_mendeley_paper.snp_id
      end
    end
  end
end
|
||||
Reference in New Issue
Block a user