refactoring Plos(Search)

This commit is contained in:
Helge Rausch
2013-12-08 12:51:40 +01:00
parent 207b96e5f5
commit b3ec41f8cb
11 changed files with 218 additions and 132 deletions

15
Gemfile
View File

@@ -11,13 +11,14 @@ gem 'bcrypt-ruby', :require => "bcrypt"
gem 'sanitize'
gem "recaptcha", :require => "recaptcha/rails"
gem 'dynamic_form'
gem 'capistrano'
gem 'capistrano', '~> 2.0'
gem 'rvm-capistrano', '1.4.4'
gem 'exceptional'
# apis
gem 'fitgem'
gem 'mendeley', git: 'git://github.com/tsujigiri/mendeley.git', branch: 'paging_search'
gem 'mendeley', github: 'tsujigiri/mendeley', branch: 'paging_search'
gem 'plos', github: 'tsujigiri/plos', branch: 'master', require: false
# New Relic monitoring, off by default in development
gem 'newrelic_rpm'
@@ -37,13 +38,13 @@ gem 'sunspot_solr', '2.0.0'
gem 'rubyzip', :git => 'git://github.com/rubyzip/rubyzip.git'
gem "will_paginate"
gem 'nested_form', :git => 'git://github.com/ryanb/nested_form.git'
gem 'nested_form', github: 'ryanb/nested_form'
gem 'json'
gem 'mediawiki-gateway'
gem 'activerecord-import', '~> 0.2.11'
gem 'paperclip', '~> 3.0'
gem 'friendly_id', :git => 'git://github.com/FriendlyId/friendly_id.git', branch: '4.0-stable' # the branch is for Rails 3
gem 'recommendify',:git => 'git://github.com/paulasmuth/recommendify.git', :ref => "34308c4"
gem 'friendly_id', github: 'FriendlyId/friendly_id', branch: '4.0-stable' # the branch is for Rails 3
gem 'recommendify', github: 'paulasmuth/recommendify', :ref => "34308c4"
# for jobs
gem 'sidekiq'
@@ -78,9 +79,9 @@ group :test do
gem 'factory_girl'
gem 'mocha', require: false
gem 'debugger'
gem 'sunspot_test', git: 'git://github.com/tsujigiri/sunspot_test.git', branch: 'dirty_quickfix'
#gem "turn", "< 0.8.3" # truncates backtraces in the tests (bad)
gem 'sunspot_test', github: 'tsujigiri/sunspot_test', branch: 'dirty_quickfix'
gem 'simplecov', require: false
gem 'webmock'
end
group :development, :test do

View File

@@ -35,6 +35,15 @@ GIT
rest-client
yajl-ruby
GIT
remote: git://github.com/tsujigiri/plos.git
revision: de96b3a5d2868a89cecf6de5987fb204f0f1e6dc
branch: master
specs:
plos (0.0.6)
nokogiri
rest-client
GIT
remote: git://github.com/tsujigiri/sunspot_test.git
revision: eb70e7cb2a7bc9b57ac892c901cac47a6ab04ad6
@@ -88,6 +97,7 @@ GEM
activesupport (3.2.16)
i18n (~> 0.6, >= 0.6.4)
multi_json (~> 1.0)
addressable (2.3.5)
arbre (1.0.1)
activesupport (>= 3.0.0)
arel (3.0.3)
@@ -118,6 +128,8 @@ GEM
coffee-script-source (1.6.3)
columnize (0.3.6)
connection_pool (1.1.0)
crack (0.4.1)
safe_yaml (~> 0.9.0)
debugger (1.6.2)
columnize (>= 0.3.1)
debugger-linecache (~> 1.2.0)
@@ -244,6 +256,7 @@ GEM
builder (>= 2.1.2)
rvm-capistrano (1.4.4)
capistrano (>= 2.15.4)
safe_yaml (0.9.7)
sanitize (2.0.6)
nokogiri (>= 1.4.4)
sass (3.2.12)
@@ -307,6 +320,9 @@ GEM
rack (>= 1.0.0)
warden (1.2.3)
rack (>= 1.0)
webmock (1.16.0)
addressable (>= 2.2.7)
crack (>= 0.3.2)
will_paginate (3.0.5)
yajl-ruby (1.1.0)
yui-compressor (0.12.0)
@@ -320,7 +336,7 @@ DEPENDENCIES
authlogic
bartt-ssl_requirement (~> 1.4.0)
bcrypt-ruby
capistrano
capistrano (~> 2.0)
coffee-script
debugger
devise (= 3.0.0)
@@ -343,6 +359,7 @@ DEPENDENCIES
newrelic_rpm
paperclip (~> 3.0)
pg
plos!
rails (~> 3.2.16)
rails3-generators
recaptcha
@@ -365,5 +382,6 @@ DEPENDENCIES
uglifier
uuidtools
vegas
webmock
will_paginate
yui-compressor

View File

@@ -29,7 +29,7 @@ class SnpsController < ApplicationController
@total_genotypes = @snp.genotype_frequency.map {|k,v| v }.sum
@total_alleles = @snp.allele_frequency.map {|k,v| v }.sum
Sidekiq::Client.enqueue(Plos, @snp.id)
Sidekiq::Client.enqueue(PlosSearch, @snp.id)
Sidekiq::Client.enqueue(MendeleySearch, @snp.id)
Sidekiq::Client.enqueue(Snpedia, @snp.id)

View File

@@ -33,9 +33,9 @@ class Snp < ActiveRecord::Base
snps = Snp.select([ :id, :mendeley_updated, :snpedia_updated, :plos_updated ]).
where([ 'mendeley_updated < ? or snpedia_updated < ? or plos_updated < ?',
max_age, max_age, max_age ]).find_each do |snp|
Sidekiq::Client.enqueue(Mendeley, snp.id) if snp.mendeley_updated < max_age
Sidekiq::Client.enqueue(Snpedia, snp.id) if snp.snpedia_updated < max_age
Sidekiq::Client.enqueue(Plos, snp.id) if snp.plos_updated < max_age
Sidekiq::Client.enqueue(Mendeley, snp.id) if snp.mendeley_updated < max_age
Sidekiq::Client.enqueue(Snpedia, snp.id) if snp.snpedia_updated < max_age
Sidekiq::Client.enqueue(PlosSearch, snp.id) if snp.plos_updated < max_age
end
end
@@ -44,4 +44,21 @@ class Snp < ActiveRecord::Base
Sidekiq::Client.enqueue(Frequency,s.id)
end
end
# TODO: move to after hook, checking whether one of the *_updated attributes
# has changed and updating the ranking if so.
#
# Recomputes +ranking+ as a weighted sum of the associated record counts:
# mendeley_paper x1, plos_paper x2, snpedia_paper x5, genome_gov_paper x2 and
# pgp_annotation x2. Only assigns the attribute; it does not save the record.
#
# Returns the newly assigned ranking.
def update_ranking
  # The operators deliberately trail each line. A line that *starts* with `+`
  # is parsed as its own unary-plus statement, which would silently drop every
  # term after the first one.
  self.ranking =
    mendeley_paper.count +
    2 * plos_paper.count +
    5 * snpedia_paper.count +
    2 * genome_gov_paper.count +
    2 * pgp_annotation.count
end
# Records that the PLOS data for this record was just refreshed: stamps
# +plos_updated+ with the current time, recomputes the ranking and saves.
#
# Returns whatever +save+ returns.
def plos_updated!
self.plos_updated = Time.current
# Refresh the ranking before saving so both changes go out in the same save.
update_ranking
save
end
end

View File

@@ -1,93 +0,0 @@
require 'net/http'
require 'rexml/document'
# Sidekiq worker that queries the PLOS search API for a SNP name and stores
# every returned document as a PlosPaper, enqueueing a PlosDetails job for each.
class Plos
  include Sidekiq::Worker
  sidekiq_options :queue => :plos, :retry => 5, :unique => true

  # Upper bound on immediate HTTP attempts; after that the error is re-raised
  # so the job fails visibly instead of spinning forever.
  MAX_HTTP_ATTEMPTS = 3

  # Returns true (and logs) when +name+ contains one of the forbidden
  # fragments, false otherwise.
  def is_illegal_snp(name)
    # we don't need mitochondrial or VG-SNPs as these just result in noise
    # from the PLOS API
    forbidden_names = ["mt-", "vg"]
    return false unless forbidden_names.any? { |part| name[part] }

    log "plos: Snp #{name} is a mitochondrial or vg snp"
    true
  end

  # Returns true (and logs) when +snp+ was updated within the last 31 days,
  # false otherwise.
  def is_old_snp(snp)
    # we don't need to update snps that have been updated in the last month
    return false unless snp.plos_updated > 31.days.ago

    log "plos: time threshold for #{snp.name} not met"
    true
  end

  # Looks up the Snp with +snp_id+, searches PLOS for its name and imports the
  # results.
  #
  # Returns false when the SNP is skipped or no papers are reported.
  # Raises whatever the HTTP request raised once MAX_HTTP_ATTEMPTS is exhausted.
  def perform(snp_id)
    # Logging stuff
    Rails.logger.level = 0
    Rails.logger = Logger.new("#{Rails.root}/log/plos_#{Rails.env}.log")

    # Get SNP
    @snp = Snp.find(snp_id)
    return false if is_illegal_snp(@snp.name) || is_old_snp(@snp)

    # Block form closes the key file even if reading fails.
    api_key = File.open(::Rails.root.to_s+"/key_plos.txt") do |key_handle|
      key_handle.readline.rstrip
    end
    url = "http://api.plos.org/search?q="+@snp.name+"&api_key="+api_key

    xml_data = fetch_body(url)
    doc = REXML::Document.new(xml_data)
    all_elements = doc.elements.to_a

    # check if there are any papers to add...
    # NOTE(review): `index` reports "not found" as nil on String/Array, not -1.
    # Confirm what this REXML call actually returns, since this comparison
    # decides whether any paper gets imported at all.
    if all_elements[0][2].index("numFound='0'") != -1
      log "plos: none found\n"
      return false
    end

    log "plos: got papers"
    log "all elements: #{all_elements}"
    log "Checking: #{all_elements[0][2]}"
    all_elements[0][2].each do |singleton|
      import_result(singleton)
    end

    @snp.plos_updated = Time.zone.now
    @snp.save
    log "plos: sleep for 5 secs\n"
    sleep(5)
  end

  def log(msg)
    Rails.logger.info "#{DateTime.now}: #{msg}"
  end

  private

  # Fetches +url+ and returns the response body, retrying a bounded number of
  # times because the HTTP layer is known to fail sporadically.
  def fetch_body(url)
    attempts = 0
    begin
      attempts += 1
      Net::HTTP.get_response(URI.parse(url)).body
    rescue => error
      retry if attempts < MAX_HTTP_ATTEMPTS
      log "plos: giving up on #{@snp.name} after #{attempts} attempts (#{error.class}: #{error.message})"
      raise
    end
  end

  # Extracts the paper fields from one result node, creates the PlosPaper if
  # its DOI is unknown, and enqueues the details job for it.
  def import_result(singleton)
    log "plos: Looking at: #{singleton}"
    log "Trying #{singleton[7]}"

    # Non-bang gsub: gsub! returns nil when nothing was replaced, which would
    # turn an already-clean value into nil.
    first_author = singleton[2][0].to_s.gsub(/<\/?str>/,"")
    log "first author: #{first_author}"
    doi = singleton[4].to_s.gsub(/<\/?str( name='id')?>/,"")
    log "doi: #{doi}"
    pub_date = singleton[6].to_s.gsub(/<\/?date( name='publication_date')?>/,"")
    log "pub_date: #{pub_date}"
    title = CGI.unescapeHTML(singleton[7].to_s.gsub(/<\/?str( name='title_display')?>/,""))
    log "full title: #{title}"

    existing = PlosPaper.find_by_doi(doi)
    if existing.nil?
      @plos_paper = PlosPaper.new(:first_author => first_author, :doi => doi, :title => title, :pub_date => pub_date, :snp_id => @snp.id)
      @plos_paper.save
      log "-> written new paper\n"
      @snp.ranking = @snp.mendeley_paper.count + 2*@snp.plos_paper.count + 5*@snp.snpedia_paper.count + 2*@snp.genome_gov_paper.count + 2*@snp.pgp_annotation.count
    else
      log "-> paper is old"
      @plos_paper = existing
    end

    Sidekiq::Client.enqueue(PlosDetails,@plos_paper)
  end
end

View File

@@ -0,0 +1,77 @@
# See http://api.plos.org/solr/faq/#solr_api_recommended_usage for API limits
require 'plos'
# Sidekiq worker that searches PLOS for a SNP's name through PLOS::Client and
# stores every returned article as a PlosPaper.
class PlosSearch
  include Sidekiq::Worker
  sidekiq_options queue: :plos, unique: true

  attr_reader :snp, :client

  # Reads the PLOS API key from key_plos.txt in the Rails root.
  def self.api_key
    # TODO: put in APP_CONFIG
    File.read(Rails.root.join("key_plos.txt")).strip
  end

  # Imports all PLOS articles found for the Snp with +snp_id+.
  #
  # Returns false when the Snp does not exist, has a forbidden name or was
  # updated recently; otherwise runs the import, marks the Snp as updated and
  # pauses before returning.
  def perform(snp_id)
    setup_logger
    @snp = Snp.where(id: snp_id).first
    return false if @snp.nil? || snp_illegal? || recently_updated?

    @client = PLOS::Client.new(self.class.api_key)
    perform_search.each { |article| import_article(article) }
    snp.plos_updated!
    sleep(6) # honoring API limits
  end

  # Creates or updates the PlosPaper matching the article's DOI, links it to
  # the current snp and enqueues a PlosDetails job with the paper's id.
  def import_article(article)
    attributes = {
      first_author: article.authors.first.to_s,
      doi: article.id,
      pub_date: article.published_at,
      title: article.title,
      snp_id: snp.id,
    }

    paper = PlosPaper.find_or_initialize_by_doi(attributes[:doi])
    paper.update_attributes!(attributes)
    Sidekiq::Client.enqueue(PlosDetails, paper.id)
  end

  # Runs the search for the snp's name, aborting after five seconds.
  def perform_search
    # honoring API limits
    Timeout.timeout(5) { client.search(snp.name) }
  end

  # True (and logged) when the snp's name contains a forbidden fragment.
  def snp_illegal?
    # we don't need mitochondrial or VG-SNPs as these just result in noise
    # from the PLOS API
    return false unless ["mt-", "vg"].any? { |fragment| snp.name[fragment] }

    log "Snp #{snp.name} is a mitochondrial or vg snp"
    true
  end

  # True (and logged) when the snp was updated within the last 31 days.
  def recently_updated?
    # we don't need to update snps that have been updated in the last month
    return false unless snp.plos_updated > 31.days.ago

    log "time threshold for #{snp.name} not met"
    true
  end

  # Replaces Rails.logger with a logger writing to the per-environment PLOS
  # log file, at level 0.
  def setup_logger
    Rails.logger = Logger.new(Rails.root.join("log/plos_#{Rails.env}.log"))
    Rails.logger.tap { |plos_logger| plos_logger.level = 0 }
  end

  # Writes +msg+ to Rails.logger at info level, prefixed with a timestamp.
  def log(msg)
    Rails.logger.info "#{DateTime.now}: #{msg}"
  end
end

View File

@@ -3,32 +3,38 @@ require "net/http"
require "json"
class PlosDetails
include Sidekiq::Worker
sidekiq_options :queue => :plos_details, :retry => 5, :unique => true
include Sidekiq::Worker
sidekiq_options :queue => :plos_details, :retry => 5, :unique => true
def perform(plos_paper)
@Plos_paper = PlosPaper.find_by_id(plos_paper["plos_paper"]["id"].to_i)
key_handle = File.open(::Rails.root.to_s+"/key_plos.txt")
api_key = key_handle.readline.rstrip
detail_url = "http://alm.plos.org/articles/" + @Plos_paper.doi + ".json?api_key="+api_key
begin
detail_resp = Net::HTTP.get_response(URI.parse(detail_url))
rescue
retry
def perform(plos_paper)
plos_paper_id =
if plos_paper.is_a?(Hash)
plos_paper["plos_paper"]["id"].to_i
else
plos_paper.to_i
end
detail_data = detail_resp.body
detail_result = JSON.parse(detail_data)
plos_paper = PlosPaper.find_by_id(plos_paper_id)
key_handle = File.open(::Rails.root.to_s+"/key_plos.txt")
api_key = key_handle.readline.rstrip
print "plos details: updated reader-status\n"
print detail_result
readers_total = detail_result["article"]["events_count"].to_i
@Plos_paper.reader = readers_total.to_i
@Plos_paper.save
print "-> sleep for 5 secs\n\n"
sleep(5)
end
detail_url = "http://alm.plos.org/articles/" + plos_paper.doi + ".json?api_key="+api_key
begin
detail_resp = Net::HTTP.get_response(URI.parse(detail_url))
rescue
retry
end
detail_data = detail_resp.body
detail_result = JSON.parse(detail_data)
print "plos details: updated reader-status\n"
print detail_result
readers_total = detail_result["article"]["events_count"].to_i
plos_paper.reader = readers_total.to_i
plos_paper.save
print "-> sleep for 6 secs\n\n"
sleep(6)
end
end

View File

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="UTF-8"?>
<response>
<result name="response" numFound="47" start="0" maxScore="1.2268105">
<doc>
<str name="id">10.1371/journal.pone.0013771</str>
<str name="journal">PLoS ONE</str>
<str name="eissn">1932-6203</str>
<date name="publication_date">2010-10-29T00:00:00Z</date>
<str name="article_type">Research Article</str>
<arr name="author_display">
<str>Ester Aparicio</str>
<str>Mariona Parera</str>
<str>Sandra Franco</str>
<str>Nuria Pérez-Alvarez</str>
<str>Cristina Tural</str>
<str>Bonaventura Clotet</str>
<str>Miguel Angel Martínez</str>
</arr>
<arr name="abstract">
<str>
Recent genome-wide association studies report that the SNP rs8099917, located 8.9 kb upstream of the start codon of IL28B, is associated with both disease chronicity and therapeutic response to pegIFN-α and RBV in patients infected with genotype 1 HCV. To determine the effect of rs8099917 variation on the response of HCV to therapy, we genotyped this variant in a cohort of 160 HCV/HIV-1 coinfected patients in our clinic unit who received combined peg-IFN-α/RBV therapy. The rs8099917 T/G or G/G genotypes were observed in 56 patients (35%). Treatment failure occurred in 80% of G-allele carriers versus 48% of non-carriers (P&lt;0.0001). This result reveals that the G allele was strongly associated with treatment failure in this patient cohort. Importantly, a highly significant association was found between the G-allele and response to therapy in HCV genotype 1-infected patients (P&lt;0.0001) but not in HCV genotype 3-infected patients. Multivariate analysis (odds ratio; 95% confidence interval; P value) indicated that the rs8099917 TT genotype was a strong predictor of treatment success (5.83; 1.2626.92; P=0.021), independent of baseline plasma HCV-RNA load less than 500 000 IU/ml (4.85; 1.1819.95; P=0.025) and absence of advanced liver fibrosis (5.24; 1.2022.91; P=0.025). These results reveal the high prevalence of the rs8099917 G allele in HCV/HIV-1 coinfected patients as well as its strong association with treatment failure in HCV genotype 1-infected patients. rs8099917 SNP genotyping may be a valid pre-treatment predictor of which patients are likely to respond to treatment in this group of difficult-to-treat HCV/HIV-infected patients.
</str>
</arr>
<str name="title_display">IL28B SNP rs8099917 Is Strongly Associated with Pegylated Interferon-α and Ribavirin Therapy Treatment Failure in HCV/HIV-1 Coinfected Patients</str>
<float name="score">1.2268105</float>
</doc>
</result>
</response>

View File

@@ -7,11 +7,14 @@ require "shoulda-context"
require "mocha/setup"
require 'rails/test_help'
require "authlogic/test_case"
require 'webmock/test_unit'
WebMock.disable_net_connect!(:allow_localhost => true)
SunspotTest.solr_startup_timeout = 30
require 'sunspot_test/test_unit'
require 'factory_girl'
FactoryGirl.find_definitions
require 'paperclip/matchers'
require 'plos'
class ActiveSupport::TestCase
extend Paperclip::Shoulda::Matchers

View File

@@ -0,0 +1,29 @@
require_relative '../test_helper'
# Exercises the PlosSearch worker against a canned PLOS search response served
# through WebMock.
class PlosSearchTest < ActiveSupport::TestCase
  context "worker" do
    setup do
      @snp = FactoryGirl.create(:snp)
    end

    should "associate new paper with snp" do
      response = File.read(Rails.root.join('test/data/plos_search_response.xml'))
      stub_request(:post, "api.plos.org/search").
        with(body: { 'api_key' => 'xxx', 'q' => @snp.name, 'rows' => '50', 'start' => '0' }).
        to_return(status: 200, body: response)
      PlosSearch.stubs(:api_key).returns('xxx')
      # kind_of(Integer) rather than instance_of(Fixnum): it matches ids on
      # Rubies with and without a separate Fixnum class.
      Sidekiq::Client.expects(:enqueue).with(PlosDetails, kind_of(Integer))

      worker = PlosSearch.new
      # The worker pauses to respect the API rate limit; the test should not
      # actually wait for it.
      worker.stubs(:sleep)

      assert_difference(-> { PlosPaper.count }) do
        worker.perform(@snp.id)
      end

      @snp.reload
      assert @snp.plos_updated

      plos_paper = PlosPaper.last
      assert_equal @snp, plos_paper.snp
      assert_equal 'Ester Aparicio', plos_paper.first_author
      assert_equal '10.1371/journal.pone.0013771', plos_paper.doi
      assert_equal DateTime.new(2010, 10, 29), plos_paper.pub_date
      assert_match 'rs8099917', plos_paper.title
    end
  end
end

View File

@@ -11,9 +11,9 @@ class SnpTest < ActiveSupport::TestCase
@snp.mendeley_updated = @snp.snpedia_updated = @snp.plos_updated = 32.days.ago
@snp.save
queue = sequence('queue')
Sidekiq::Client.expects(:enqueue).with(Mendeley, @snp.id).in_sequence(queue)
Sidekiq::Client.expects(:enqueue).with(Snpedia, @snp.id).in_sequence(queue)
Sidekiq::Client.expects(:enqueue).with(Plos, @snp.id).in_sequence(queue)
Sidekiq::Client.expects(:enqueue).with(Mendeley, @snp.id).in_sequence(queue)
Sidekiq::Client.expects(:enqueue).with(Snpedia, @snp.id).in_sequence(queue)
Sidekiq::Client.expects(:enqueue).with(PlosSearch, @snp.id).in_sequence(queue)
Snp.update_papers
end