refactoring MendeleySearch

It now processes each page of results before proceeding to the next one.
This way the job should no longer use up all available memory when a
search returns a lot of results.
This commit is contained in:
Helge Rausch
2013-09-08 09:35:38 +02:00
parent 273a006d5b
commit 77b1521186
8 changed files with 224 additions and 91 deletions

View File

@@ -79,3 +79,6 @@ group :test do
gem 'simplecov', require: false
end
# uuidtools generates the random UUIDs used by the :mendeley_paper factory
# and by MendeleySearchTest.
group :development, :test do
gem 'uuidtools'
end

View File

@@ -295,6 +295,7 @@ GEM
uglifier (2.1.2)
execjs (>= 0.3.0)
multi_json (~> 1.0, >= 1.0.2)
uuidtools (2.1.4)
vegas (0.1.11)
rack (>= 1.0.0)
warden (1.2.3)
@@ -351,6 +352,7 @@ DEPENDENCIES
therubyracer
twitter-bootstrap-rails
uglifier
uuidtools
vegas
will_paginate
yui-compressor

View File

@@ -1,7 +1,13 @@
# A paper record attached to a single SNP. Every paper needs a title, an
# SNP and a uuid, and no two papers may share the same uuid.
class MendeleyPaper < ActiveRecord::Base
  belongs_to :snp

  validates :title, :snp, :uuid, presence: true
  validates :uuid, uniqueness: true

  # Only the title is declared as a searchable text field.
  searchable do
    text :title
  end

  # Returns the stored first author, or "Unknown" when the stored value is
  # blank.
  def first_author
    stored_name = read_attribute(:first_author)
    stored_name.presence || "Unknown"
  end
end

View File

@@ -2,7 +2,7 @@ class Snp < ActiveRecord::Base
has_many :user_snps, foreign_key: :snp_name, primary_key: :name,
dependent: :destroy
has_many :plos_paper
has_many :mendeley_paper
has_many :mendeley_paper, dependent: :destroy
has_many :snpedia_paper
has_many :snp_comments
has_many :genome_gov_paper
@@ -44,5 +44,4 @@ class Snp < ActiveRecord::Base
Sidekiq::Client.enqueue(Frequency,s.id)
end
end
end

View File

@@ -3,93 +3,84 @@ require "net/http"
require "json"
class MendeleySearch
include Sidekiq::Worker
sidekiq_options :queue => :mendeley, :retry => 5, :unique => true
include Sidekiq::Worker
attr_reader :snp
def perform(snp_id)
snp = Snp.find(snp_id)
if (snp.mendeley_updated.nil? || snp.mendeley_updated < 31.days.ago) &&
snp.name.index("vg").nil? && snp.name.index("mt-").nil?
page = 0
items = 500
documents = []
begin
begin
result = Mendeley::API::Documents.
search('"' + snp.name + '"', { items: items, page: page })
documents.concat(result['documents'])
puts result["total_pages"]
puts page
page += 1
rescue => e
puts e.class
puts e.message
puts "retrying..."
sleep 1
retry
end
sleep 1
end while result['total_pages'].to_i > 0 &&
result['total_pages'].to_i > result['current_page'].to_i
sidekiq_options :queue => :mendeley, :retry => 5, :unique => true
if result["error"].present?
puts "Mendeley API seems to be down."
puts "Error is:"
puts result["error"]
return
elsif documents.present?
puts "mendeley: Found #{documents.size} papers"
documents.each do |document|
uuid = document["uuid"].to_s
begin
first_author = document["authors"].first["forename"] + ' ' +
document["authors"].first["surname"]
rescue => e
puts "Something wrong in #{document["authors"]}: #{e.class}: #{e.message}"
first_author = "Unknown"
end
# Worker entry point: looks up the SNP with the given id and searches for
# papers about it when an update is due.
#
# snp_id - id of the Snp record to process.
#
# Logs an error and does nothing when no such SNP exists; logs an info
# message when the SNP does not need to be updated.
def perform(snp_id)
  @snp = Snp.where(id: snp_id).first

  unless snp
    logger.error("Snp(#{snp_id}) not found.")
    return
  end

  return search if update_mendeley?

  logger.info("mendeley papers for #{snp.name} do not need to be updated")
end
if MendeleyPaper.where(uuid: uuid).count == 0
puts "-> paper is new"
@mendeley_paper = MendeleyPaper.new(
snp_id: snp.id,
title: document['title'],
mendeley_url: document['mendeley_url'],
first_author: first_author,
pub_year: document['year'],
uuid: uuid
)
doi = document["doi"]
@mendeley_paper.doi = doi if doi.present?
@mendeley_paper.save
snp.ranking = snp.mendeley_paper.count +
2*snp.plos_paper.count + 5*snp.snpedia_paper.count +
2*snp.genome_gov_paper.count + 2*snp.pgp_annotation.count
puts "-> Written paper"
else
puts "-> paper is old"
@mendeley_paper = MendeleyPaper.find_by_uuid(uuid)
if @mendeley_paper.title == ""
puts "-> paper is broken and will be replaced now"
@mendeley_paper.update_attributes(
:title => document['title'],
:snp_id => snp.id,
:mendeley_url => document['mendeley_url'],
:first_author => first_author,
:pub_year => document['year']
)
end
end
Sidekiq::Client.enqueue(MendeleyDetails, @mendeley_paper.id)
end
else
puts "mendeley: No papers found"
end
snp.mendeley_updated = Time.zone.now
snp.save
else
puts "mendeley: time threshold not met"
end
end
# Queries the Mendeley API for documents mentioning the SNP's name and hands
# each page of results to #process_documents as soon as it arrives, so only
# one page of documents is held at a time.
#
# When a response carries an "error" entry, a warning is logged and the
# method returns without touching the SNP. The update timestamp is left
# alone on purpose: setting it would mark the SNP as up to date and suppress
# further searches for 31 days even though nothing was fetched.
#
# After the last page the SNP's mendeley_updated timestamp and ranking are
# refreshed and the SNP is saved.
#
# Raises RuntimeError if the SNP cannot be saved.
def search
  page = 0
  items = 500

  loop do
    result = Mendeley::API::Documents.
      search("\"#{snp.name}\"", { items: items, page: page })

    if result["error"].present?
      logger.warn(
        "Mendeley API seems to be down.\nError is: #{result["error"]}")
      return
    end

    process_documents(result['documents'])

    total_pages = result['total_pages'].to_i
    break unless total_pages > 0 && total_pages > result['current_page'].to_i

    page += 1
    # Pause between requests only; there is nothing to wait for after the
    # last page.
    sleep 1
  end

  snp.mendeley_updated = Time.now
  snp.ranking = snp.mendeley_paper.count +
    2 * snp.plos_paper.count + 5 * snp.snpedia_paper.count +
    2 * snp.genome_gov_paper.count + 2 * snp.pgp_annotation.count

  unless snp.save
    raise "could not save snp(#{snp.name}): #{snp.errors.full_messages.join(", ")}"
  end
end
# Creates or repairs a MendeleyPaper for every document of one result page.
#
# documents - Array of document hashes as returned by the Mendeley search
#             (keys read here: "uuid", "title", "authors", "mendeley_url",
#             "year", "doi"); may be nil or empty.
#
# Papers that already exist and are valid are left untouched. New papers and
# existing invalid ones get their attributes (re)assigned from the document;
# once saved, a MendeleyDetails job is enqueued for them. A paper that cannot
# be saved is logged and skipped, so no job is enqueued for it.
def process_documents(documents)
  if documents.blank?
    logger.info("mendeley: No papers found")
    return
  end

  documents.each do |document|
    uuid = document["uuid"].to_s
    mendeley_paper = MendeleyPaper.find_or_initialize_by_uuid(uuid)
    next unless mendeley_paper.new_record? || !mendeley_paper.valid?

    # "authors" may be missing or empty; the author is then left unset
    # instead of raising on nil.
    author = Array(document["authors"]).first
    first_author = "#{author["forename"]} #{author["surname"]}" if author.present?

    logger.info("creating or updating paper")
    mendeley_paper.attributes = {
      snp: snp,
      title: document['title'],
      mendeley_url: document['mendeley_url'],
      first_author: first_author,
      pub_year: document['year'],
      uuid: uuid,
      doi: document["doi"].presence
    }

    # save runs the validations itself and returns false when they fail.
    unless mendeley_paper.save
      logger.error("MendeleyPaper for #{snp.name} invalid.\n" \
        "#{mendeley_paper.errors.full_messages.join(", ")}")
      next
    end

    Sidekiq::Client.enqueue(MendeleyDetails, mendeley_paper.id)
  end
end
# True when the SNP has never been searched or its last search is more than
# 31 days old, and its name contains neither "vg" nor "mt-".
def update_mendeley?
  last_update = snp.mendeley_updated
  due = last_update.nil? || last_update < 31.days.ago

  due && !snp.name.include?("vg") && !snp.name.include?("mt-")
end
end

View File

@@ -277,9 +277,9 @@ ActiveRecord::Schema.define(:version => 20130904010950) do
t.string "allele_frequency"
t.integer "ranking"
t.integer "number_of_users", :default => 0
t.datetime "mendeley_updated", :default => '2013-07-31 12:44:49'
t.datetime "plos_updated", :default => '2013-07-31 12:44:49'
t.datetime "snpedia_updated", :default => '2013-07-31 12:44:49'
t.datetime "mendeley_updated", :default => '2013-07-31 12:17:26'
t.datetime "plos_updated", :default => '2013-07-31 12:17:26'
t.datetime "snpedia_updated", :default => '2013-07-31 12:17:26'
t.datetime "created_at", :null => false
t.datetime "updated_at", :null => false
end

View File

@@ -83,4 +83,13 @@ FactoryGirl.define do
steps 100
floors 1
end
# Sample MendeleyPaper attributes. No snp is set here, so callers that need
# a valid record must pass one.
factory :mendeley_paper do
  title "Musterstudie"
  # Evaluated per built object. to_s yields a plain UUID string, the same
  # form the worker and its test document use, rather than a UUID object.
  uuid { UUIDTools::UUID.random_create.to_s }
  first_author "Max Mustermann"
  mendeley_url "http://example.com"
  doi "10.1000/182"
  pub_year 2013
end
end

View File

@@ -0,0 +1,123 @@
require_relative '../test_helper'
# Tests for the MendeleySearch worker: when a search is triggered, how the
# search result is handed on, and how single documents become MendeleyPaper
# records. The Mendeley API, Sidekiq and persistence are mocked throughout.
class MendeleySearchTest < ActiveSupport::TestCase
  context "worker" do
    setup do
      @snp = FactoryGirl.build_stubbed(:snp, id: 1)
      @worker = MendeleySearch.new
      @document = {
        "uuid" => UUIDTools::UUID.random_create.to_s,
        "title" => "Test Driven Development And Why You Should Do It",
        "authors" => [{ "forename" => "Max", "surname" => "Mustermann" }],
        "mendeley_url" => "http://example.com",
        "year" => "2013",
        "doi" => "456",
      }
    end

    should "do nothing if snp does not exist" do
      @worker.expects(:search).never
      @worker.expects(:update_mendeley?).never
      @worker.perform(0)
    end

    context "with existing snp" do
      setup do
        Snp.stubs(:where).returns(Snp)
        Snp.stubs(:first).returns(@snp)
      end

      should "search for papers if the last update was too long ago" do
        @worker.expects(:search)
        @snp.stubs(:mendeley_updated).returns(32.days.ago)
        @worker.perform(1)
      end

      should "not search for papers if the last update was not too long ago" do
        @worker.expects(:search).never
        @snp.stubs(:mendeley_updated).returns(30.days.ago)
        @worker.perform(1)
      end

      should "search for papers if snp was never searched for" do
        @worker.expects(:search)
        @snp.stubs(:mendeley_updated).returns(nil)
        @worker.perform(1)
      end
    end

    context "searched-for papers" do
      setup do
        @worker.stubs(:snp).returns(@snp)
        # Keep the worker's pause between API requests from slowing the test.
        @worker.stubs(:sleep)
      end

      should "be processed" do
        Mendeley::API::Documents.expects(:search).
          with("\"#{@snp.name}\"", { items: 500, page: 0 }).
          returns({ "documents" => [@document] })
        @worker.expects(:process_documents).with([@document])
        # The matcher block has to return a boolean itself; it must not rely
        # on the return value of an assertion.
        @snp.expects(:mendeley_updated=).with { |time| time.is_a?(Time) }
        @snp.expects(:ranking=)
        @snp.expects(:save).returns(true)
        @worker.search
      end
    end

    context "processing documents" do
      setup do
        @worker.stubs(:snp).returns(@snp)
      end

      should "create papers that do not already exist" do
        uuid = @document["uuid"]
        new_mendeley_paper = MendeleyPaper.new(uuid: uuid)
        MendeleyPaper.expects(:find_or_initialize_by_uuid).with(uuid).
          returns(new_mendeley_paper)
        new_mendeley_paper.expects(:save).returns(true)
        Sidekiq::Client.expects(:enqueue).
          with { |klass, _id| klass == MendeleyDetails }
        @worker.process_documents([@document])
        assert_equal @snp.id, new_mendeley_paper.snp_id
        assert_equal @document["title"], new_mendeley_paper.title
        assert_equal @document["mendeley_url"], new_mendeley_paper.mendeley_url
        assert_equal "Max Mustermann", new_mendeley_paper.first_author
        assert_equal @document["year"].to_i, new_mendeley_paper.pub_year
        assert_equal @document["uuid"], new_mendeley_paper.uuid
        assert_equal @document["doi"], new_mendeley_paper.doi
      end

      should "not update existing valid papers" do
        uuid = @document["uuid"]
        existing_mendeley_paper = FactoryGirl.
          build_stubbed(:mendeley_paper, uuid: uuid, snp: @snp)
        MendeleyPaper.expects(:find_or_initialize_by_uuid).with(uuid).
          returns(existing_mendeley_paper)
        MendeleyPaper.any_instance.expects(:save).never
        Sidekiq::Client.expects(:enqueue).never
        @worker.process_documents([@document])
      end

      should "update existing invalid papers" do
        uuid = @document["uuid"]
        # Built without an snp, so the paper is invalid until processed.
        existing_mendeley_paper = FactoryGirl.
          build_stubbed(:mendeley_paper, snp: nil)
        existing_mendeley_paper.expects(:save).returns(true)
        MendeleyPaper.expects(:find_or_initialize_by_uuid).with(uuid).
          returns(existing_mendeley_paper)
        Sidekiq::Client.expects(:enqueue)
        @worker.process_documents([@document])
        assert_equal @snp.id, existing_mendeley_paper.snp_id
      end
    end
  end
end