From b3ec41f8cb7df8fea0c56e86205069cfdcbf4ccb Mon Sep 17 00:00:00 2001 From: Helge Rausch Date: Sun, 8 Dec 2013 12:51:40 +0100 Subject: [PATCH] refactoring Plos(Search) --- Gemfile | 15 ++--- Gemfile.lock | 20 ++++++- app/controllers/snps_controller.rb | 2 +- app/models/snp.rb | 23 +++++++- app/workers/plos.rb | 93 ------------------------------ app/workers/plos_search.rb | 77 +++++++++++++++++++++++++ app/workers/plosdetails.rb | 54 +++++++++-------- test/data/plos_search_response.xml | 28 +++++++++ test/test_helper.rb | 3 + test/unit/plos_search_test.rb | 29 ++++++++++ test/unit/snp_test.rb | 6 +- 11 files changed, 218 insertions(+), 132 deletions(-) delete mode 100644 app/workers/plos.rb create mode 100644 app/workers/plos_search.rb create mode 100644 test/data/plos_search_response.xml create mode 100644 test/unit/plos_search_test.rb diff --git a/Gemfile b/Gemfile index 3075095..84a0f25 100644 --- a/Gemfile +++ b/Gemfile @@ -11,13 +11,14 @@ gem 'bcrypt-ruby', :require => "bcrypt" gem 'sanitize' gem "recaptcha", :require => "recaptcha/rails" gem 'dynamic_form' -gem 'capistrano' +gem 'capistrano', '~> 2.0' gem 'rvm-capistrano', '1.4.4' gem 'exceptional' # apis gem 'fitgem' -gem 'mendeley', git: 'git://github.com/tsujigiri/mendeley.git', branch: 'paging_search' +gem 'mendeley', github: 'tsujigiri/mendeley', branch: 'paging_search' +gem 'plos', github: 'tsujigiri/plos', branch: 'master', require: false # New Relic monitoring, off by default in development gem 'newrelic_rpm' @@ -37,13 +38,13 @@ gem 'sunspot_solr', '2.0.0' gem 'rubyzip', :git => 'git://github.com/rubyzip/rubyzip.git' gem "will_paginate" -gem 'nested_form', :git => 'git://github.com/ryanb/nested_form.git' +gem 'nested_form', github: 'ryanb/nested_form' gem 'json' gem 'mediawiki-gateway' gem 'activerecord-import', '~> 0.2.11' gem 'paperclip', '~> 3.0' -gem 'friendly_id', :git => 'git://github.com/FriendlyId/friendly_id.git', branch: '4.0-stable' # the branch is for Rails 3 -gem 'recommendify',:git => 'git://github.com/paulasmuth/recommendify.git', :ref => "34308c4" +gem 'friendly_id', github: 'FriendlyId/friendly_id', branch: '4.0-stable' # the branch is for Rails 3 +gem 'recommendify', github: 'paulasmuth/recommendify', :ref => "34308c4" # for jobs gem 'sidekiq' @@ -78,9 +79,9 @@ group :test do gem 'factory_girl' gem 'mocha', require: false gem 'debugger' - gem 'sunspot_test', git: 'git://github.com/tsujigiri/sunspot_test.git', branch: 'dirty_quickfix' - #gem "turn", "< 0.8.3" # truncates backtraces in the tests (bad) + gem 'sunspot_test', github: 'tsujigiri/sunspot_test', branch: 'dirty_quickfix' gem 'simplecov', require: false + gem 'webmock' end group :development, :test do diff --git a/Gemfile.lock b/Gemfile.lock index dc36599..7ea5a85 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -35,6 +35,15 @@ GIT rest-client yajl-ruby +GIT + remote: git://github.com/tsujigiri/plos.git + revision: de96b3a5d2868a89cecf6de5987fb204f0f1e6dc + branch: master + specs: + plos (0.0.6) + nokogiri + rest-client + GIT remote: git://github.com/tsujigiri/sunspot_test.git revision: eb70e7cb2a7bc9b57ac892c901cac47a6ab04ad6 @@ -88,6 +97,7 @@ GEM activesupport (3.2.16) i18n (~> 0.6, >= 0.6.4) multi_json (~> 1.0) + addressable (2.3.5) arbre (1.0.1) activesupport (>= 3.0.0) arel (3.0.3) @@ -118,6 +128,8 @@ GEM coffee-script-source (1.6.3) columnize (0.3.6) connection_pool (1.1.0) + crack (0.4.1) + safe_yaml (~> 0.9.0) debugger (1.6.2) columnize (>= 0.3.1) debugger-linecache (~> 1.2.0) @@ -244,6 +256,7 @@ GEM builder (>= 2.1.2) rvm-capistrano (1.4.4) capistrano (>= 2.15.4) + safe_yaml (0.9.7) sanitize (2.0.6) nokogiri (>= 1.4.4) sass (3.2.12) @@ -307,6 +320,9 @@ GEM rack (>= 1.0.0) warden (1.2.3) rack (>= 1.0) + webmock (1.16.0) + addressable (>= 2.2.7) + crack (>= 0.3.2) will_paginate (3.0.5) yajl-ruby (1.1.0) yui-compressor (0.12.0) @@ -320,7 +336,7 @@ DEPENDENCIES authlogic bartt-ssl_requirement (~> 1.4.0) bcrypt-ruby - capistrano + capistrano (~> 2.0) coffee-script debugger devise (= 3.0.0) @@ -343,6 +359,7 @@ DEPENDENCIES newrelic_rpm paperclip (~> 3.0) pg + plos! rails (~> 3.2.16) rails3-generators recaptcha @@ -365,5 +382,6 @@ DEPENDENCIES uglifier uuidtools vegas + webmock will_paginate yui-compressor diff --git a/app/controllers/snps_controller.rb b/app/controllers/snps_controller.rb index 35f86e4..ffa25c1 100644 --- a/app/controllers/snps_controller.rb +++ b/app/controllers/snps_controller.rb @@ -29,7 +29,7 @@ class SnpsController < ApplicationController @total_genotypes = @snp.genotype_frequency.map {|k,v| v }.sum @total_alleles = @snp.allele_frequency.map {|k,v| v }.sum - Sidekiq::Client.enqueue(Plos, @snp.id) + Sidekiq::Client.enqueue(PlosSearch, @snp.id) Sidekiq::Client.enqueue(MendeleySearch, @snp.id) Sidekiq::Client.enqueue(Snpedia, @snp.id) diff --git a/app/models/snp.rb b/app/models/snp.rb index 7aa7f75..c550bc1 100644 --- a/app/models/snp.rb +++ b/app/models/snp.rb @@ -33,9 +33,9 @@ class Snp < ActiveRecord::Base snps = Snp.select([ :id, :mendeley_updated, :snpedia_updated, :plos_updated ]). where([ 'mendeley_updated < ? or snpedia_updated < ? or plos_updated < ?', max_age, max_age, max_age ]).find_each do |snp| - Sidekiq::Client.enqueue(Mendeley, snp.id) if snp.mendeley_updated < max_age - Sidekiq::Client.enqueue(Snpedia, snp.id) if snp.snpedia_updated < max_age - Sidekiq::Client.enqueue(Plos, snp.id) if snp.plos_updated < max_age + Sidekiq::Client.enqueue(Mendeley, snp.id) if snp.mendeley_updated < max_age + Sidekiq::Client.enqueue(Snpedia, snp.id) if snp.snpedia_updated < max_age + Sidekiq::Client.enqueue(PlosSearch, snp.id) if snp.plos_updated < max_age end end @@ -44,4 +44,21 @@ class Snp < ActiveRecord::Base Sidekiq::Client.enqueue(Frequency,s.id) end end + + # TODO: move to after hook, checking whether one of the *_updated attributes + # has changed and updating the ranking if so. + def update_ranking + self.ranking = + mendeley_paper.count + + 2 * plos_paper.count + + 5 * snpedia_paper.count + + 2 * genome_gov_paper.count + + 2 * pgp_annotation.count + end + + def plos_updated! + self.plos_updated = Time.current + update_ranking + save + end end diff --git a/app/workers/plos.rb b/app/workers/plos.rb deleted file mode 100644 index 55e2556..0000000 --- a/app/workers/plos.rb +++ /dev/null @@ -1,93 +0,0 @@ - -require 'net/http' -require 'rexml/document' - -class Plos - include Sidekiq::Worker - sidekiq_options :queue => :plos, :retry => 5, :unique => true - - def is_illegal_snp(name) - # we don't need mitochondrial or VG-SNPs as these just result in noise - # from the PLOS API - forbidden_names = ["mt-", "vg"] - if forbidden_names.any? { |part| name[part] } - log "plos: Snp #{name} is a mitochondrial or vg snp" - return true - end - end - - def is_old_snp(snp) - # we don't need to update snps that have been updated in the last month - if snp.plos_updated > 31.days.ago - log "plos: time threshold for #{snp.name} not met" - return true - end - end - - def perform(snp_id) - # Logging stuff - Rails.logger.level = 0 - Rails.logger = Logger.new("#{Rails.root}/log/plos_#{Rails.env}.log") - - # Get SNP - @snp = Snp.find(snp_id) - - return false if is_illegal_snp(@snp.name) or is_old_snp(@snp) - - key_handle = File.open(::Rails.root.to_s+"/key_plos.txt") - api_key = key_handle.readline.rstrip - - url = "http://api.plos.org/search?q="+@snp.name+"&api_key="+api_key - - begin - xml_data = Net::HTTP.get_response(URI.parse(url)).body - rescue # yep, this sucks, but the http-parser likes to break down without any reason, so this retries... - retry - end - - doc = REXML::Document.new(xml_data) - - all_elements = doc.elements.to_a - # check if there are any papers to add... - if all_elements[0][2].index("numFound='0'") != -1 - log "plos: none found\n" - return false - end - - log "plos: got papers" - log "all elements: #{all_elements}" - log "Checking: #{all_elements[0][2]}" - all_elements[0][2].each do |singleton| - log "plos: Looking at: #{singleton}" - log "Trying #{singleton[7]}" - first_author = singleton[2][0].to_s.gsub!(/<\/?str>/,"") - log "first author: #{first_author}" - doi = singleton[4].to_s.gsub!(/<\/?str( name='id')?>/,"") - log "doi: #{doi}" - pub_date = singleton[6].to_s.gsub!(/<\/?date( name='publication_date')?>/,"") - log "pub_date: #{pub_date}" - title = CGI.unescapeHTML(singleton[7].to_s.gsub!(/<\/?str( name='title_display')?>/,"")) - log "full title: #{title}" - - if PlosPaper.find_all_by_doi(doi) == [] - @plos_paper = PlosPaper.new(:first_author => first_author, :doi => doi, :title => title, :pub_date => pub_date, :snp_id => @snp.id) - @plos_paper.save - log "-> written new paper\n" - @snp.ranking = @snp.mendeley_paper.count + 2*@snp.plos_paper.count + 5*@snp.snpedia_paper.count + 2*@snp.genome_gov_paper.count + 2*@snp.pgp_annotation.count - else - log "-> paper is old" - @plos_paper = PlosPaper.find_by_doi(doi) - end - Sidekiq::Client.enqueue(PlosDetails,@plos_paper) - end - @snp.plos_updated = Time.zone.now - @snp.save - log "plos: sleep for 5 secs\n" - sleep(5) - end - - def log msg - Rails.logger.info "#{DateTime.now}: #{msg}" - end -end - diff --git a/app/workers/plos_search.rb b/app/workers/plos_search.rb new file mode 100644 index 0000000..451229a --- /dev/null +++ b/app/workers/plos_search.rb @@ -0,0 +1,77 @@ +# See http://api.plos.org/solr/faq/#solr_api_recommended_usage for API limits +require 'plos' + +class PlosSearch + include Sidekiq::Worker + sidekiq_options queue: :plos, unique: true + attr_reader :snp, :client + + def perform(snp_id) + setup_logger + @snp = Snp.where(id: snp_id).first + return false if @snp.nil? || snp_illegal? || recently_updated? + @client = PLOS::Client.new(self.class.api_key) + articles = perform_search + articles.each do |article| + import_article(article) + end + snp.plos_updated! + sleep(6) # honoring API limits + end + + def import_article(article) + plos_paper_attributes = { + first_author: article.authors.first.to_s, + doi: article.id, + pub_date: article.published_at, + title: article.title, + snp_id: snp.id, + } + plos_paper = PlosPaper.find_or_initialize_by_doi(plos_paper_attributes[:doi]) + plos_paper.update_attributes!(plos_paper_attributes) + Sidekiq::Client.enqueue(PlosDetails, plos_paper.id) + end + + def perform_search + # honoring API limits + Timeout.timeout(5) do + client.search(snp.name) + end + end + + def snp_illegal? + # we don't need mitochondrial or VG-SNPs as these just result in noise + # from the PLOS API + forbidden_names = ["mt-", "vg"] + if forbidden_names.any? { |part| snp.name[part] } + log "Snp #{snp.name} is a mitochondrial or vg snp" + true + else + false + end + end + + def recently_updated? + # we don't need to update snps that have been updated in the last month + if snp.plos_updated > 31.days.ago + log "time threshold for #{snp.name} not met" + true + else + false + end + end + + def setup_logger + Rails.logger = Logger.new(Rails.root.join("log/plos_#{Rails.env}.log")) + Rails.logger.level = 0 + end + + def log(msg) + Rails.logger.info "#{DateTime.now}: #{msg}" + end + + def self.api_key + # TODO: put in APP_CONFIG + File.read(Rails.root.join("key_plos.txt")).strip + end +end diff --git a/app/workers/plosdetails.rb b/app/workers/plosdetails.rb index 9a4d87a..650ae08 100644 --- a/app/workers/plosdetails.rb +++ b/app/workers/plosdetails.rb @@ -3,32 +3,38 @@ require "net/http" require "json" class PlosDetails - include Sidekiq::Worker - sidekiq_options :queue => :plos_details, :retry => 5, :unique => true + include Sidekiq::Worker + sidekiq_options :queue => :plos_details, :retry => 5, :unique => true - def perform(plos_paper) - @Plos_paper = PlosPaper.find_by_id(plos_paper["plos_paper"]["id"].to_i) - - key_handle = File.open(::Rails.root.to_s+"/key_plos.txt") - api_key = key_handle.readline.rstrip - - detail_url = "http://alm.plos.org/articles/" + @Plos_paper.doi + ".json?api_key="+api_key - begin - detail_resp = Net::HTTP.get_response(URI.parse(detail_url)) - rescue - retry + def perform(plos_paper) + plos_paper_id = + if plos_paper.is_a?(Hash) + plos_paper["plos_paper"]["id"].to_i + else + plos_paper.to_i end - detail_data = detail_resp.body - detail_result = JSON.parse(detail_data) + plos_paper = PlosPaper.find_by_id(plos_paper_id) + key_handle = File.open(::Rails.root.to_s+"/key_plos.txt") + api_key = key_handle.readline.rstrip - print "plos details: updated reader-status\n" - print detail_result - readers_total = detail_result["article"]["events_count"].to_i - - @Plos_paper.reader = readers_total.to_i - @Plos_paper.save - print "-> sleep for 5 secs\n\n" - sleep(5) - end + detail_url = "http://alm.plos.org/articles/" + plos_paper.doi + ".json?api_key="+api_key + begin + detail_resp = Net::HTTP.get_response(URI.parse(detail_url)) + rescue + retry + end + + detail_data = detail_resp.body + detail_result = JSON.parse(detail_data) + + print "plos details: updated reader-status\n" + print detail_result + readers_total = detail_result["article"]["events_count"].to_i + + plos_paper.reader = readers_total.to_i + plos_paper.save + print "-> sleep for 6 secs\n\n" + sleep(6) + end end diff --git a/test/data/plos_search_response.xml b/test/data/plos_search_response.xml new file mode 100644 index 0000000..7b44c7e --- /dev/null +++ b/test/data/plos_search_response.xml @@ -0,0 +1,28 @@ + + + + + 10.1371/journal.pone.0013771 + PLoS ONE + 1932-6203 + 2010-10-29T00:00:00Z + Research Article + + Ester Aparicio + Mariona Parera + Sandra Franco + Nuria Pérez-Alvarez + Cristina Tural + Bonaventura Clotet + Miguel Angel Martínez + + + + Recent genome-wide association studies report that the SNP rs8099917, located 8.9 kb upstream of the start codon of IL28B, is associated with both disease chronicity and therapeutic response to pegIFN-α and RBV in patients infected with genotype 1 HCV. To determine the effect of rs8099917 variation on the response of HCV to therapy, we genotyped this variant in a cohort of 160 HCV/HIV-1 coinfected patients in our clinic unit who received combined peg-IFN-α/RBV therapy. The rs8099917 T/G or G/G genotypes were observed in 56 patients (35%). Treatment failure occurred in 80% of G-allele carriers versus 48% of non-carriers (P<0.0001). This result reveals that the G allele was strongly associated with treatment failure in this patient cohort. Importantly, a highly significant association was found between the G-allele and response to therapy in HCV genotype 1-infected patients (P<0.0001) but not in HCV genotype 3-infected patients. Multivariate analysis (odds ratio; 95% confidence interval; P value) indicated that the rs8099917 TT genotype was a strong predictor of treatment success (5.83; 1.26–26.92; P = 0.021), independent of baseline plasma HCV-RNA load less than 500 000 IU/ml (4.85; 1.18–19.95; P = 0.025) and absence of advanced liver fibrosis (5.24; 1.20–22.91; P = 0.025). These results reveal the high prevalence of the rs8099917 G allele in HCV/HIV-1 coinfected patients as well as its strong association with treatment failure in HCV genotype 1-infected patients. rs8099917 SNP genotyping may be a valid pre-treatment predictor of which patients are likely to respond to treatment in this group of difficult-to-treat HCV/HIV-infected patients. + + + IL28B SNP rs8099917 Is Strongly Associated with Pegylated Interferon-α and Ribavirin Therapy Treatment Failure in HCV/HIV-1 Coinfected Patients + 1.2268105 + + + diff --git a/test/test_helper.rb b/test/test_helper.rb index d568970..cdaccb7 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -7,11 +7,14 @@ require "shoulda-context" require "mocha/setup" require 'rails/test_help' require "authlogic/test_case" +require 'webmock/test_unit' +WebMock.disable_net_connect!(:allow_localhost => true) SunspotTest.solr_startup_timeout = 30 require 'sunspot_test/test_unit' require 'factory_girl' FactoryGirl.find_definitions require 'paperclip/matchers' +require 'plos' class ActiveSupport::TestCase extend Paperclip::Shoulda::Matchers diff --git a/test/unit/plos_search_test.rb b/test/unit/plos_search_test.rb new file mode 100644 index 0000000..0abbcdf --- /dev/null +++ b/test/unit/plos_search_test.rb @@ -0,0 +1,29 @@ +require_relative '../test_helper' + +class PlosSearchTest < ActiveSupport::TestCase + context "worker" do + setup do + @snp = FactoryGirl.create(:snp) + end + + should "associate new paper with snp" do + response = File.read(Rails.root.join('test/data/plos_search_response.xml')) + stub_request(:post, "api.plos.org/search"). + with(body: { 'api_key' => 'xxx', 'q' => @snp.name, 'rows' => '50', 'start' => '0' }). + to_return(status: 200, body: response) + PlosSearch.stubs(:api_key).returns('xxx') + Sidekiq::Client.expects(:enqueue).with(PlosDetails, instance_of(Fixnum)) + assert_difference(-> { PlosPaper.count }) do + PlosSearch.new.perform(@snp.id) + end + @snp.reload + assert @snp.plos_updated + plos_paper = PlosPaper.last + assert_equal @snp, plos_paper.snp + assert_equal 'Ester Aparicio', plos_paper.first_author + assert_equal '10.1371/journal.pone.0013771', plos_paper.doi + assert_equal DateTime.new(2010, 10, 29), plos_paper.pub_date + assert_match 'rs8099917', plos_paper.title + end + end +end diff --git a/test/unit/snp_test.rb b/test/unit/snp_test.rb index 3c93832..ae0b936 100644 --- a/test/unit/snp_test.rb +++ b/test/unit/snp_test.rb @@ -11,9 +11,9 @@ class SnpTest < ActiveSupport::TestCase @snp.mendeley_updated = @snp.snpedia_updated = @snp.plos_updated = 32.days.ago @snp.save queue = sequence('queue') - Sidekiq::Client.expects(:enqueue).with(Mendeley, @snp.id).in_sequence(queue) - Sidekiq::Client.expects(:enqueue).with(Snpedia, @snp.id).in_sequence(queue) - Sidekiq::Client.expects(:enqueue).with(Plos, @snp.id).in_sequence(queue) + Sidekiq::Client.expects(:enqueue).with(Mendeley, @snp.id).in_sequence(queue) + Sidekiq::Client.expects(:enqueue).with(Snpedia, @snp.id).in_sequence(queue) + Sidekiq::Client.expects(:enqueue).with(PlosSearch, @snp.id).in_sequence(queue) Snp.update_papers end