mirror of
https://github.com/chenasraf/snpr.git
synced 2026-05-18 01:39:01 +00:00
git pushMerge branch 'master' of github.com:gedankenstuecke/snpr
This commit is contained in:
45
.gitignore
vendored
45
.gitignore
vendored
@@ -1,32 +1,31 @@
|
||||
mail_username.txt
|
||||
mail_password.txt
|
||||
public/system/images/*
|
||||
dump.rdb
|
||||
sunspot-solr-development.log.*
|
||||
key_mendeley.txt
|
||||
key_plos.txt
|
||||
api_key.txt
|
||||
.bundle
|
||||
config/app_config.yml
|
||||
config/database.yml
|
||||
config/newrelic.yml
|
||||
coverage
|
||||
db/*.sqlite3
|
||||
log/*.log
|
||||
tmp/pids/server.pid
|
||||
tmp/cache
|
||||
public/data/*23andme*
|
||||
development.log
|
||||
dump.rdb
|
||||
hiredis/
|
||||
key_mendeley.txt
|
||||
key_plos.txt
|
||||
log/
|
||||
log/*.log
|
||||
mail_password.txt
|
||||
mail_username.txt
|
||||
/public/data/
|
||||
/public/system/images/
|
||||
.sass-cache/
|
||||
secret_token
|
||||
server.pid
|
||||
*.swp
|
||||
solr/data
|
||||
solr/pids
|
||||
config/database.yml
|
||||
sunspot-solr-development.log.*
|
||||
*.swp
|
||||
tmp/
|
||||
tmp/cache
|
||||
tmp/pids/server.pid
|
||||
.vagrant
|
||||
vendor/bundle
|
||||
vendor/cache
|
||||
coverage
|
||||
hiredis/
|
||||
log/
|
||||
secret_token
|
||||
config/app_config.yml
|
||||
config/newrelic.yml
|
||||
tmp/
|
||||
.vagrant
|
||||
.sass-cache/
|
||||
public/data/zip/opensnp_datadump.current.zip
|
||||
|
||||
@@ -14,8 +14,9 @@ before_script:
|
||||
- echo "bar" > mail_password.txt
|
||||
- echo "f7264dd590e09007703723a75550c748" > secret_token
|
||||
- psql -c 'create database snpr_test;' -U postgres
|
||||
# script: "bundle install" # this is automatically run by travis-ci
|
||||
- bundle exec rake db:setup
|
||||
services:
|
||||
- redis-server
|
||||
script: "bundle exec rake --trace db:migrate test"
|
||||
|
||||
addons:
|
||||
postgresql: "9.2"
|
||||
script: "bundle exec rake test"
|
||||
|
||||
3
Gemfile
3
Gemfile
@@ -22,11 +22,10 @@ gem 'plos', require: false
|
||||
# New Relic monitoring, off by default in development
|
||||
gem 'newrelic_rpm'
|
||||
|
||||
# workaround for bug in Fedora
|
||||
|
||||
# DB
|
||||
gem 'pg'
|
||||
gem 'activerecord-import', '~> 0.2.11'
|
||||
gem 'composite_primary_keys'
|
||||
|
||||
# for solr (indexing, searching)
|
||||
gem 'sunspot_rails'#, '2.0.0'
|
||||
|
||||
@@ -127,6 +127,8 @@ GEM
|
||||
coffee-script-source
|
||||
execjs
|
||||
coffee-script-source (1.6.3)
|
||||
composite_primary_keys (5.0.14)
|
||||
activerecord (~> 3.2.0, >= 3.2.9)
|
||||
connection_pool (1.1.0)
|
||||
crack (0.4.1)
|
||||
safe_yaml (~> 0.9.0)
|
||||
@@ -396,6 +398,7 @@ DEPENDENCIES
|
||||
capistrano (~> 2.0)
|
||||
capybara
|
||||
coffee-script
|
||||
composite_primary_keys
|
||||
database_cleaner
|
||||
devise (= 3.0.0)
|
||||
dynamic_form
|
||||
|
||||
@@ -84,4 +84,19 @@ default :from => "donotreply@opensnp.org"
|
||||
puts "http://"+ActionMailer::Base.default_url_options[:host]+@link
|
||||
end
|
||||
|
||||
def finished_parsing(genotype_id, stats)
|
||||
genotype = Genotype.find(genotype_id)
|
||||
@user = genotype.user
|
||||
@stats = stats
|
||||
@vendor = {
|
||||
'ftdna-illumina' => 'FamilyTreeDNA',
|
||||
'23andme' => '23andMe',
|
||||
'IYG' => 'Inside Your Genome',
|
||||
'decodeme' => 'deCODEme',
|
||||
'23andme-exome-vcf' => '23andMe',
|
||||
'ancestry' => 'Ancestry',
|
||||
}.fetch(genotype.filetype)
|
||||
|
||||
mail(to: @user.email, subject: 'Finished parsing your genotyping')
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
class Achievement < ActiveRecord::Base
|
||||
attr_accessible :award,:short_name
|
||||
has_many :user_achievements
|
||||
attr_accessible :award,:short_name
|
||||
has_many :user_achievements
|
||||
|
||||
searchable do
|
||||
text :award
|
||||
end
|
||||
searchable do
|
||||
text :award
|
||||
end
|
||||
end
|
||||
|
||||
@@ -26,11 +26,11 @@ class Genotype < ActiveRecord::Base
|
||||
end
|
||||
|
||||
def parse_genotype
|
||||
Sidekiq::Client.enqueue(Preparsing, id)
|
||||
Preparsing.perform_async(id)
|
||||
end
|
||||
|
||||
def delete_genotype
|
||||
Sidekiq::Client.enqueue(DeleteGenotype, { genotype_id: id })
|
||||
DeleteGenotype.perform_async(genotype_id: id)
|
||||
end
|
||||
|
||||
Paperclip.interpolates :fs_filename do |attachment, style|
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
class Snp < ActiveRecord::Base
|
||||
has_many :user_snps, foreign_key: :snp_name, primary_key: :name,
|
||||
dependent: :destroy
|
||||
has_many :user_snps, foreign_key: :snp_name, primary_key: :name
|
||||
has_many :users, through: :user_snps
|
||||
has_many :pgp_annotations
|
||||
has_many :snp_references
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
class UserSnp < ActiveRecord::Base
|
||||
belongs_to :snp, foreign_key: :snp_name, primary_key: :name,
|
||||
counter_cache: true
|
||||
self.primary_keys = [:genotype_id, :snp_name]
|
||||
belongs_to :snp, foreign_key: :snp_name, primary_key: :name, counter_cache: true
|
||||
has_one :user, through: :genotype
|
||||
belongs_to :genotype
|
||||
|
||||
|
||||
@@ -6,6 +6,9 @@
|
||||
<% if @unkown_chromosome[i] == false %>
|
||||
<SEGMENT id="<%= @id[i] %>" version="1.0" <%if @has_start[i] == true %> start="<%= @start_and_end[i][0] %>" stop="<%= @start_and_end[i][1] %>"<%else%> start="1" stop="" <%end%>>
|
||||
<% @user_snps[i].each do |us| %>
|
||||
<% if us.snp.nil? %>
|
||||
<% next %>
|
||||
<% end %>
|
||||
<FEATURE id="<%= us.snp.name %>">
|
||||
<TYPE id="<%= us.local_genotype %>" />
|
||||
<METHOD id="" />
|
||||
|
||||
10
app/views/user_mailer/finished_parsing.text.erb
Normal file
10
app/views/user_mailer/finished_parsing.text.erb
Normal file
@@ -0,0 +1,10 @@
|
||||
Hello <%= @user.name %>,
|
||||
|
||||
I just finished parsing your <%= @vendor %> file. I found a total of
|
||||
<%= @stats[:rows_without_comments] %> rows (excluding comments) in it and was
|
||||
able to import <%= @stats[:rows_after_parsing] %> SNPs.
|
||||
|
||||
If this seems way off, feel free to contact us!
|
||||
|
||||
Cheers,
|
||||
The OpenSNP Parser
|
||||
@@ -11,9 +11,8 @@ class DeleteGenotype
|
||||
# This user_snp is the only one, so, destroy the Snp,
|
||||
# which destroys the UserSnp implicitly
|
||||
us.snp.destroy
|
||||
else
|
||||
us.destroy
|
||||
end
|
||||
us.destroy
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
@@ -2,139 +2,227 @@ class Parsing
|
||||
include Sidekiq::Worker
|
||||
sidekiq_options :queue => :parse, :retry => 5, :unique => true
|
||||
|
||||
def perform(genotype_id, temp_file)
|
||||
Rails.logger.level = 0
|
||||
Rails.logger = Logger.new("#{Rails.root}/log/parsing_#{Rails.env}.log")
|
||||
genotype_id = genotype_id["genotype"]["id"].to_i if genotype_id.is_a?(Hash)
|
||||
attr_reader :genotype, :temp_table_name, :tempfile, :stats, :start_time
|
||||
|
||||
def perform(genotype_id)
|
||||
@stats = {}
|
||||
@start_time = Time.current
|
||||
logger.info("Started parsing genotype with id #{genotype_id}")
|
||||
@genotype = Genotype.find(genotype_id)
|
||||
|
||||
if @genotype.filetype != "other"
|
||||
# IYG filetype needs proper dbSNP-names
|
||||
if @genotype.filetype == "IYG"
|
||||
db_snp_snps = {"MT-T3027C"=>"rs199838004", "MT-T4336C"=>"rs41456348", "MT-G4580A"=>"rs28357975", "MT-T5004C"=>"rs41419549", "MT-C5178a"=>"rs28357984", "MT-A5390G"=>"rs41333444", "MT-C6371T"=>"rs41366755", "MT-G8697A"=>"rs28358886", "MT-G9477A"=>"rs2853825", "MT-G10310A"=>"rs41467651", "MT-A10550G"=>"rs28358280", "MT-C10873T"=>"rs2857284", "MT-C11332T"=>"rs55714831", "MT-A11947G"=>"rs28359168", "MT-A12308G"=>"rs2853498", "MT-A12612G"=>"rs28359172", "MT-T14318C"=>"rs28357675", "MT-T14766C"=>"rs3135031", "MT-T14783C"=>"rs28357680"}
|
||||
end
|
||||
stats[:filetype] = genotype.filetype
|
||||
stats[:genotype_id] = genotype.id
|
||||
@temp_table_name = "user_snps_temp_#{genotype.id}"
|
||||
@tempfile = Tempfile.new("snpr_genotype_#{genotype.id}_")
|
||||
|
||||
genotype_file = File.open(temp_file, "r")
|
||||
log "Loading known Snps."
|
||||
known_snps = Snp.pluck(:name).to_set
|
||||
user_genotype_ids = @genotype.user.genotypes.pluck(:id)
|
||||
known_user_snps = UserSnp.where(genotype_id: user_genotype_ids).
|
||||
pluck('distinct(snp_name)').to_set
|
||||
|
||||
new_snps = []
|
||||
new_user_snps = []
|
||||
create_temp_table
|
||||
normalize_csv
|
||||
copy_csv_into_temp_table
|
||||
insert_into_snps
|
||||
insert_into_user_snps
|
||||
notify_user
|
||||
|
||||
log "Parsing file #{temp_file}"
|
||||
# open that file, go through each line
|
||||
genotype_file.each do |single_snp|
|
||||
next if single_snp[0] == "#"
|
||||
logger.info("Finished parsing genotype with id #{genotype.id}, cleaning up.")
|
||||
rescue => e
|
||||
logger.error("Failed with #{e.class}: #{e.message}")
|
||||
raise
|
||||
ensure
|
||||
drop_temp_table
|
||||
# TODO: Why doesn't `tempfile.unlink` work here?
|
||||
File.delete(tempfile.path)
|
||||
stats[:duration] = "#{(Time.current - start_time).round(3)}s"
|
||||
logger.info("Stats: #{stats.to_a.map { |s| s.join('=') }.join(', ')}")
|
||||
end
|
||||
|
||||
# make a nice array if line is no comment
|
||||
if @genotype.filetype == "IYG"
|
||||
prior_snp_array = single_snp.gsub("\n","").split("\t")
|
||||
name = prior_snp_array[0]
|
||||
if name.starts_with? "MT"
|
||||
# check whether it's in db_snp_snps, use that name
|
||||
position = name.tr('A-Za-z-','') # MT-G1234G -> 1234
|
||||
def create_temp_table
|
||||
execute("drop table if exists #{temp_table_name}")
|
||||
execute(<<-SQL)
|
||||
create table #{temp_table_name} (
|
||||
genotype_id int,
|
||||
snp_name varchar(32),
|
||||
chromosome varchar(32),
|
||||
position varchar(32),
|
||||
local_genotype char(2)
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
if db_snp_snps[name] # do we have a dbSNP-name?
|
||||
name = db_snp_snps[name]
|
||||
end
|
||||
|
||||
snp_array = [name, "MT", position, prior_snp_array[1]]
|
||||
else
|
||||
snp_array = [prior_snp_array[0], "1", "1", prior_snp_array[1]]
|
||||
end
|
||||
log "SNP_ARRAY IS"
|
||||
log snp_array
|
||||
elsif @genotype.filetype == "23andme"
|
||||
snp_array = single_snp.split("\t")
|
||||
|
||||
elsif @genotype.filetype == "ancestry"
|
||||
temp_array = single_snp.split("\t")
|
||||
if temp_array[0] != "rsid"
|
||||
snp_array = [temp_array[0],temp_array[1],temp_array[2],temp_array[3]+temp_array[4]]
|
||||
else
|
||||
next
|
||||
end
|
||||
|
||||
elsif @genotype.filetype == "decodeme"
|
||||
temp_array = single_snp.split(",")
|
||||
if temp_array[0] != "Name"
|
||||
snp_array = [temp_array[0],temp_array[2],temp_array[3],temp_array[5]]
|
||||
else
|
||||
next
|
||||
end
|
||||
|
||||
elsif @genotype.filetype == "ftdna-illumina"
|
||||
temp_array = single_snp.split("\",\"")
|
||||
if temp_array[0].index("RSID") == nil
|
||||
if temp_array[0] != nil and temp_array[1] != nil and temp_array[2] != nil and temp_array[3] != nil
|
||||
snp_array = [temp_array[0].gsub("\"",""),temp_array[1].gsub("\"",""),temp_array[2].gsub("\"",""),temp_array[3].gsub("\"","").rstrip]
|
||||
else
|
||||
UserMailer.parsing_error(@genotype.user_id).deliver
|
||||
break
|
||||
end
|
||||
else
|
||||
next
|
||||
end
|
||||
|
||||
elsif @genotype.filetype == "23andme-exome-vcf"
|
||||
temp_array = single_snp.split("\t")
|
||||
@format_array = temp_array[-2].split(":")
|
||||
@format_array.each_with_index do |element,index|
|
||||
if element == "GT"
|
||||
@genotype_position = index
|
||||
end
|
||||
end
|
||||
@genotype_non_parsed = temp_array[-1].split(":")[@genotype_position].split("/")
|
||||
@genotype_parsed = ""
|
||||
@genotype_non_parsed.each do |allele|
|
||||
if allele == "0"
|
||||
@genotype_parsed = @genotype_parsed + temp_array[3]
|
||||
elsif allele == "1"
|
||||
@genotype_parsed = @genotype_parsed + temp_array[4]
|
||||
end
|
||||
end
|
||||
snp_array = [temp_array[2].downcase,temp_array[0],temp_array[1],@genotype_parsed.upcase]
|
||||
|
||||
unless known_snps.include?(snp_array[0].downcase)
|
||||
next
|
||||
end
|
||||
end
|
||||
# Drops the per-genotype temporary import table.
#
# This is called from the `ensure` section of `perform`, so it must not raise
# when the table was never created: a failing drop would replace the original
# error. `if exists` mirrors the statement used in `create_temp_table`.
def drop_temp_table
  # `perform` assigns the table name only after the genotype was found; when
  # that lookup failed there is no name and nothing to drop.
  return unless temp_table_name

  execute("drop table if exists #{temp_table_name}")
end
|
||||
|
||||
if snp_array[0] && snp_array[1] && snp_array[2] && snp_array[3] && snp_array[3].strip.length == 2
|
||||
# if we do not have the fitting SNP, make one and parse all paper-types for it
|
||||
|
||||
unless known_snps.include?(snp_array[0].downcase)
|
||||
snp = Snp.new(:name => snp_array[0].downcase, :chromosome => snp_array[1], :position => snp_array[2], :ranking => 0, :user_snps_count => 1)
|
||||
snp.default_frequencies
|
||||
new_snps << snp
|
||||
end
|
||||
|
||||
if known_user_snps.include?(snp_array[0].downcase)
|
||||
log "already known user-snp"
|
||||
else
|
||||
new_user_snps << [ @genotype.id, snp_array[0].downcase, snp_array[3].rstrip ]
|
||||
end
|
||||
else
|
||||
UserMailer.parsing_error(@genotype.user_id).deliver
|
||||
break
|
||||
end
|
||||
end
|
||||
log "Importing #{new_snps.length} new Snps"
|
||||
Snp.import new_snps
|
||||
# Converts the uploaded genotype file into a uniform CSV written to `tempfile`.
#
# Steps:
# 1. Read the file and drop comment lines (lines starting with '#').
# 2. Hand the remaining lines to the vendor specific `parse_<filetype>` method,
#    which returns rows of
#    [genotype id, snp name, chromosome, position, local genotype].
# 3. Discard rows that do not look like a valid SNP.
# 4. Write the rows comma separated to `tempfile`, close it and set its mode
#    to 0644 (presumably so the database server can read it for the `copy` in
#    `copy_csv_into_temp_table` -- confirm).
#
# Records :rows_without_comments and :rows_after_parsing in `stats`.
def normalize_csv
  lines = File.readlines(genotype.genotype.path)
  rows = lines.reject { |line| line.start_with?('#') } # Skip comments
  stats[:rows_without_comments] = rows.length

  # `tr` replaces every hyphen. `sub` only replaced the first one, so a
  # filetype like "23andme-exome-vcf" produced the invalid method name
  # "parse_23andme_exome-vcf".
  parser = :"parse_#{genotype.filetype.tr('-', '_').downcase}"
  csv = send(parser, rows)

  known_chromosomes = ['MT', 'X', 'Y'] + (1..22).map(&:to_s)
  csv.select! do |row|
    # snp name
    row[1].present? &&
      # chromosome
      known_chromosomes.include?(row[2]) &&
      # position (upper bound looks like the length of the longest human
      # chromosome -- TODO confirm)
      row[3].to_i >= 1 && row[3].to_i <= 249_250_621 &&
      # local genotype
      row[4].is_a?(String) && (1..2).include?(row[4].length)
  end
  stats[:rows_after_parsing] = csv.length

  tempfile.write(csv.map { |row| row.join(',') }.join("\n"))
  tempfile.close
  FileUtils.chmod(0644, tempfile.path)
end
|
||||
|
||||
log "Importing new UserSnps"
|
||||
user_snp_columns = [:genotype_id, :snp_name, :local_genotype]
|
||||
UserSnp.import user_snp_columns, new_user_snps, validate: false
|
||||
log "Done."
|
||||
puts "done with #{temp_file}"
|
||||
system("rm #{temp_file}")
|
||||
def copy_csv_into_temp_table
|
||||
execute(<<-SQL)
|
||||
copy #{temp_table_name} (
|
||||
genotype_id,
|
||||
snp_name,
|
||||
chromosome,
|
||||
position,
|
||||
local_genotype
|
||||
)
|
||||
from '#{tempfile.path}'
|
||||
with (FORMAT CSV, HEADER FALSE, DELIMITER ',')
|
||||
SQL
|
||||
end
|
||||
|
||||
def insert_into_snps
|
||||
time = Time.now.utc.iso8601
|
||||
execute(<<-SQL)
|
||||
insert into snps (name, chromosome, position, created_at, updated_at, user_snps_count)
|
||||
(
|
||||
select
|
||||
#{temp_table_name}.snp_name,
|
||||
#{temp_table_name}.chromosome,
|
||||
#{temp_table_name}.position,
|
||||
'#{time}',
|
||||
'#{time}',
|
||||
1
|
||||
from #{temp_table_name}
|
||||
left join snps
|
||||
on #{temp_table_name}.snp_name = snps.name
|
||||
where
|
||||
snps.name is null
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def insert_into_user_snps
|
||||
execute(<<-SQL)
|
||||
insert into user_snps (snp_name, local_genotype, genotype_id)
|
||||
(
|
||||
select
|
||||
#{temp_table_name}.snp_name,
|
||||
#{temp_table_name}.local_genotype,
|
||||
#{temp_table_name}.genotype_id
|
||||
from #{temp_table_name}
|
||||
left join user_snps
|
||||
on user_snps.snp_name = #{temp_table_name}.snp_name
|
||||
and user_snps.genotype_id = #{temp_table_name}.genotype_id
|
||||
where user_snps.snp_name is null
|
||||
)
|
||||
SQL
|
||||
end
|
||||
|
||||
def parse_23andme(rows)
|
||||
rows.map do |row|
|
||||
fields = row.strip.split("\t")
|
||||
[
|
||||
genotype.id,
|
||||
fields[0],
|
||||
fields[1],
|
||||
fields[2],
|
||||
fields[3]
|
||||
]
|
||||
end
|
||||
end
|
||||
|
||||
def log msg
|
||||
Rails.logger.info "#{DateTime.now}: #{msg}"
|
||||
# Turns the non-comment lines of a deCODEme export into normalized rows.
#
# rows - Array of Strings, one per line. A leading header line starting
#        with 'Name' is removed from the array in place.
#
# Returns an Array of [genotype id, field 0, field 2, field 3, field 5], taken
# from the comma separated line. Missing fields are nil.
def parse_decodeme(rows)
  # `rows` is empty when the file holds nothing but comments; `to_s` keeps the
  # header check from calling `start_with?` on nil.
  rows.shift if rows.first.to_s.start_with?('Name')
  rows.map do |row|
    fields = row.strip.split(',')
    [genotype.id, fields[0], fields[2], fields[3], fields[5]]
  end
end
|
||||
|
||||
# Turns the non-comment lines of an Ancestry export into normalized rows.
#
# rows - Array of Strings, one per line. A leading header line starting
#        with 'rsid' is removed from the array in place.
#
# Returns an Array of [genotype id, field 0, field 1, field 2, genotype], where
# the genotype is the concatenation of the tab separated fields 3 and 4 (an
# empty String when both are missing).
def parse_ancestry(rows)
  # `rows` is empty when the file holds nothing but comments; `to_s` keeps the
  # header check from calling `start_with?` on nil.
  rows.shift if rows.first.to_s.start_with?('rsid')
  rows.map do |row|
    fields = row.strip.split("\t")
    [genotype.id, fields[0], fields[1], fields[2], "#{fields[3]}#{fields[4]}"]
  end
end
|
||||
|
||||
# Turns the non-comment lines of a FamilyTreeDNA (Illumina) export into
# normalized rows.
#
# rows - Array of Strings, one per line. A leading header line starting
#        with RSID is removed from the array in place.
#
# Returns an Array of [genotype id, field 0, field 1, field 2, field 3] with
# all double quotes removed from the comma separated fields. Missing fields
# become empty Strings.
def parse_ftdna_illumina(rows)
  # Quotes are removed before the header check, in case the header is quoted
  # like the data rows (assumption -- confirm against a real export).
  # `to_s` covers an empty `rows` array.
  rows.shift if rows.first.to_s.delete('"').start_with?('RSID')
  rows.map do |row|
    fields = row.strip.split(',')
    # `to_s` keeps blank or short lines from raising on nil; such rows carry
    # empty values instead.
    [genotype.id] + (0..3).map { |index| fields[index].to_s.delete('"') }
  end
end
|
||||
|
||||
# Turns the non-comment lines of an "Inside Your Genome" (IYG) export into
# normalized rows.
#
# Each line holds a SNP name and a genotype separated by a tab. Names starting
# with 'MT' are put on chromosome 'MT' with the first run of digits in the
# name as position, and are replaced by their dbSNP name when one is listed
# below. All other SNPs get chromosome '1' and position '1'.
#
# rows - Array of Strings, one per line.
#
# Returns an Array of [genotype id, snp name, chromosome, position, genotype].
def parse_iyg(rows)
  db_snp_names = {
    "MT-T3027C" => "rs199838004", "MT-T4336C" => "rs41456348",
    "MT-G4580A" => "rs28357975", "MT-T5004C" => "rs41419549",
    "MT-C5178a" => "rs28357984", "MT-A5390G" => "rs41333444",
    "MT-C6371T" => "rs41366755", "MT-G8697A" => "rs28358886",
    "MT-G9477A" => "rs2853825", "MT-G10310A" => "rs41467651",
    "MT-A10550G" => "rs28358280", "MT-C10873T" => "rs2857284",
    "MT-C11332T" => "rs55714831", "MT-A11947G" => "rs28359168",
    "MT-A12308G" => "rs2853498", "MT-A12612G" => "rs28359172",
    "MT-T14318C" => "rs28357675", "MT-T14766C" => "rs3135031",
    "MT-T14783C" => "rs28357680"
  }
  rows.map do |row|
    fields = row.strip.split("\t")
    # `to_s` keeps blank lines and lines without a tab from raising on nil;
    # such rows carry empty values instead.
    snp_name = fields[0].to_s
    local_genotype = fields[1].to_s
    if snp_name.start_with?('MT')
      position = snp_name[/[0-9]+/]
      chromosome = 'MT'
    else
      position = '1'
      chromosome = '1'
    end
    [
      genotype.id,
      db_snp_names.fetch(snp_name, snp_name),
      chromosome,
      position,
      local_genotype.strip
    ]
  end
end
|
||||
|
||||
def notify_user
|
||||
UserMailer.delay.finished_parsing(genotype.id, stats)
|
||||
end
|
||||
|
||||
def execute(sql)
|
||||
Genotype.connection.execute(sql)
|
||||
end
|
||||
|
||||
def logger
|
||||
return @logger if @logger
|
||||
@logger = Logger.new(Rails.root.join("log/parsing_#{Rails.env}.log"))
|
||||
@logger.formatter = Logger::Formatter.new
|
||||
@logger
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
@@ -6,17 +6,13 @@ class Preparsing
|
||||
sidekiq_options :queue => :preparse, :retry => 10, :unique => true # only retry 10 times - after that, the genotyping probably has already been deleted
|
||||
|
||||
def perform(genotype_id)
|
||||
Rails.logger.level = 0
|
||||
Rails.logger = Logger.new("#{Rails.root}/log/preparsing_#{Rails.env}.log")
|
||||
genotype_id = genotype_id["genotype"]["id"].to_i if genotype_id.is_a?(Hash)
|
||||
@genotype = Genotype.find(genotype_id)
|
||||
filename = "#{Rails.root}/public/data/#{@genotype.fs_filename}"
|
||||
|
||||
log "Starting preparse"
|
||||
genotype = Genotype.find(genotype_id)
|
||||
|
||||
logger.info "Starting preparse"
|
||||
biggest = ''
|
||||
biggest_size = 0
|
||||
begin
|
||||
Zip::File.open(filename) do |zipfile|
|
||||
Zip::File.open(genotype.genotype.path) do |zipfile|
|
||||
# find the biggest file, since that's going to be the genotyping
|
||||
zipfile.each do |entry|
|
||||
if entry.size > biggest_size
|
||||
@@ -24,108 +20,104 @@ class Preparsing
|
||||
biggest_size = entry.size
|
||||
end
|
||||
end
|
||||
|
||||
zipfile.extract(biggest,"#{Rails.root}/tmp/#{@genotype.fs_filename}.csv")
|
||||
system("mv #{Rails.root}/tmp/#{@genotype.fs_filename}.csv #{Rails.root}/public/data/#{@genotype.fs_filename}")
|
||||
log "copied file"
|
||||
|
||||
zipfile.extract(biggest,"#{Rails.root}/tmp/#{genotype.fs_filename}.csv")
|
||||
system("mv #{Rails.root}/tmp/#{genotype.fs_filename}.csv #{Rails.root}/public/data/#{genotype.fs_filename}")
|
||||
logger.info "copied file"
|
||||
end
|
||||
|
||||
|
||||
rescue
|
||||
log "nothing to unzip, seems to be a text-file in the first place"
|
||||
logger.info "nothing to unzip, seems to be a text-file in the first place"
|
||||
end
|
||||
|
||||
|
||||
# now that they are unzipped, check if they're actually proper files
|
||||
file_is_ok = false
|
||||
fh = File.open("#{Rails.root}/public/data/#{@genotype.fs_filename}")
|
||||
fh = File.open(genotype.genotype.path)
|
||||
l = fh.readline()
|
||||
# some files, for some reason, start with the UTF-BOM-marker
|
||||
l = l.sub("\uFEFF","")
|
||||
# iterate as long as there's commenting going on
|
||||
while l.start_with?("#")
|
||||
l = fh.readline()
|
||||
l = l.sub("\uFEFF","")
|
||||
l = fh.readline()
|
||||
l = l.sub("\uFEFF","")
|
||||
end
|
||||
|
||||
if @genotype.filetype == "23andme"
|
||||
# first non-comment line is of length 4 after split
|
||||
if l.split("\t").length == 4
|
||||
log "file is 23andme and is ok!"
|
||||
file_is_ok = true
|
||||
end
|
||||
elsif @genotype.filetype == "ancestry"
|
||||
if genotype.filetype == "23andme"
|
||||
# first non-comment line is of length 4 after split
|
||||
if l.split("\t").length == 4
|
||||
logger.info "file is 23andme and is ok!"
|
||||
file_is_ok = true
|
||||
end
|
||||
elsif genotype.filetype == "ancestry"
|
||||
# first line is of length 5
|
||||
if l.split("\t").length == 5
|
||||
file_is_ok = true
|
||||
log "file is ancestry and is ok!"
|
||||
file_is_ok = true
|
||||
logger.info "file is ancestry and is ok!"
|
||||
end
|
||||
elsif genotype.filetype == "decodeme"
|
||||
# first line is of length 6
|
||||
if l.split(",").length == 6
|
||||
file_is_ok = true
|
||||
logger.info "file is decodeme and is ok!"
|
||||
end
|
||||
elsif genotype.filetype == "ftdna-illumina"
|
||||
# first line is of length 4
|
||||
if l.split(",").length == 4
|
||||
file_is_ok = true
|
||||
logger.info "file is ftdna and is ok!"
|
||||
end
|
||||
elsif genotype.filetype == "23andme-exome-vcf"
|
||||
# first line is of length 10
|
||||
if l.split("\t").length == 10
|
||||
file_is_ok = true
|
||||
logger.info "file is 23andme-exome and is ok!"
|
||||
end
|
||||
elsif genotype.filetype == "IYG"
|
||||
if l.split("\t").length == 2
|
||||
file_is_ok = true
|
||||
logger.info "file is IYG and is ok!"
|
||||
end
|
||||
elsif @genotype.filetype == "decodeme"
|
||||
# first line is of length 6
|
||||
if l.split(",").length == 6
|
||||
file_is_ok = true
|
||||
log "file is decodeme and is ok!"
|
||||
end
|
||||
elsif @genotype.filetype == "ftdna-illumina"
|
||||
# first line is of length 4
|
||||
if l.split(",").length == 4
|
||||
file_is_ok = true
|
||||
log "file is ftdna and is ok!"
|
||||
end
|
||||
elsif @genotype.filetype == "23andme-exome-vcf"
|
||||
#first line is
|
||||
if l.split("\t").length == 10
|
||||
file_is_ok = true
|
||||
log "file is 23andme-exome and is ok!"
|
||||
end
|
||||
elsif @genotype.filetype == "IYG"
|
||||
if l.split("\t").length == 2
|
||||
file_is_ok = true
|
||||
log "file is IYG and is ok!"
|
||||
end
|
||||
end
|
||||
|
||||
log "Checking whether genotyping is duplicate"
|
||||
md5 = Digest::MD5.file("#{Rails.root}/public/data/#{@genotype.fs_filename}").to_s
|
||||
logger.info "Checking whether genotyping is duplicate"
|
||||
md5 = Digest::MD5.file("#{Rails.root}/public/data/#{genotype.fs_filename}").to_s
|
||||
file_is_duplicate = false
|
||||
Genotype.all.each do |g|
|
||||
other_md5 = g.md5sum
|
||||
if other_md5 == md5 and g.id != @genotype.id
|
||||
log "Genotyping #{filename} is already uploaded!\n"
|
||||
log "Genotyping #{g.fs_filename} has the same md5sum.\n"
|
||||
file_is_ok = false
|
||||
file_is_duplicate = true
|
||||
end
|
||||
if Genotype.where(md5sum: md5).where('id != ?', genotype.id).count > 0
|
||||
file_is_duplicate = true
|
||||
logger.info "Genotyping #{genotype.genotype.path} is already uploaded!\n"
|
||||
logger.info "Genotyping #{g.fs_filename} has the same md5sum.\n"
|
||||
file_is_ok = false
|
||||
file_is_duplicate = true
|
||||
end
|
||||
|
||||
|
||||
# not proper file!
|
||||
if not file_is_ok
|
||||
if file_is_duplicate
|
||||
UserMailer.duplicate_file(@genotype.user_id).deliver
|
||||
system("rm #{Rails.root}/public/data/#{@genotype.fs_filename}")
|
||||
Genotype.find_by_id(@genotype.id).delete
|
||||
else
|
||||
UserMailer.parsing_error(@genotype.user_id).deliver
|
||||
log "file is not ok, sending email"
|
||||
# should delete the uploaded file here, leaving that for now
|
||||
# might be better to keep the file for debugging
|
||||
Genotype.find_by_id(@genotype.id).delete
|
||||
end
|
||||
if file_is_duplicate
|
||||
UserMailer.duplicate_file(genotype.user_id).deliver
|
||||
system("rm #{Rails.root}/public/data/#{genotype.fs_filename}")
|
||||
Genotype.find_by_id(genotype.id).delete
|
||||
else
|
||||
UserMailer.parsing_error(genotype.user_id).deliver
|
||||
logger.info "file is not ok, sending email"
|
||||
# should delete the uploaded file here, leaving that for now
|
||||
# might be better to keep the file for debugging
|
||||
Genotype.find_by_id(genotype.id).delete
|
||||
end
|
||||
else
|
||||
log "Updating genotype with md5sum #{md5}"
|
||||
log "Updating genotype #{@genotype.id}"
|
||||
status = @genotype.update_attributes(:md5sum => md5)
|
||||
log "Md5-updating-status is #{status}"
|
||||
logger.info "Updating genotype with md5sum #{md5}"
|
||||
logger.info "Updating genotype #{genotype.id}"
|
||||
status = genotype.update_attributes(:md5sum => md5)
|
||||
logger.info "Md5-updating-status is #{status}"
|
||||
|
||||
system("csplit -k -f #{@genotype.id}_tmpfile -n 4 #{filename} 20000 {2000}")
|
||||
system("mv #{@genotype.id}_tmpfile* tmp/")
|
||||
|
||||
temp_files = Dir.glob("tmp/#{@genotype.id}_tmpfile*")
|
||||
temp_files.each do |single_temp_file|
|
||||
Sidekiq::Client.enqueue(Parsing, @genotype.id, single_temp_file)
|
||||
end
|
||||
Parsing.perform_async(genotype.id)
|
||||
end
|
||||
end
|
||||
def log msg
|
||||
Rails.logger.info "#{DateTime.now}: #{msg}"
|
||||
|
||||
def logger
|
||||
return @logger if @logger
|
||||
@logger = Logger.new(Rails.root.join("log/preparsing_#{Rails.env}.log"))
|
||||
@logger.formatter = Logger::Formatter.new
|
||||
@logger
|
||||
end
|
||||
end
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
# Gives snps.allele_frequency, snps.genotype_frequency and snps.ranking
# database level defaults, so rows inserted with raw SQL get the same initial
# values.
#
# Written as up/down instead of `change`: `change_column` cannot be reversed
# automatically, so a rollback of the `change` version would fail.
class AddDefaultsForFrequenciesToSnps < ActiveRecord::Migration
  def up
    change_column :snps, :allele_frequency, :string, default: "---\nA: 0\nT: 0\nG: 0\nC: 0\n"
    change_column :snps, :genotype_frequency, :string, default: "--- {}\n"
    change_column :snps, :ranking, :integer, default: 0
  end

  # Restores the previous column definitions, which had no defaults.
  def down
    change_column :snps, :allele_frequency, :string, default: nil
    change_column :snps, :genotype_frequency, :string, default: nil
    change_column :snps, :ranking, :integer, default: nil
  end
end
|
||||
@@ -11,7 +11,7 @@
|
||||
#
|
||||
# It's strongly recommended to check this file into your version control system.
|
||||
|
||||
ActiveRecord::Schema.define(:version => 20140509001806) do
|
||||
ActiveRecord::Schema.define(:version => 20140820071334) do
|
||||
|
||||
create_table "achievements", :force => true do |t|
|
||||
t.text "award"
|
||||
@@ -276,9 +276,9 @@ ActiveRecord::Schema.define(:version => 20140509001806) do
|
||||
t.string "name"
|
||||
t.string "position"
|
||||
t.string "chromosome"
|
||||
t.string "genotype_frequency"
|
||||
t.string "allele_frequency"
|
||||
t.integer "ranking"
|
||||
t.string "genotype_frequency", :default => "--- {}\n"
|
||||
t.string "allele_frequency", :default => "---\nA: 0\nT: 0\nG: 0\nC: 0\n"
|
||||
t.integer "ranking", :default => 0
|
||||
t.integer "number_of_users", :default => 0
|
||||
t.datetime "mendeley_updated", :default => '2011-08-24 03:44:32'
|
||||
t.datetime "plos_updated", :default => '2011-08-24 03:44:32'
|
||||
|
||||
31
db/seeds.rb
31
db/seeds.rb
@@ -6,17 +6,24 @@
|
||||
# cities = City.create([{ :name => 'Chicago' }, { :name => 'Copenhagen' }])
|
||||
# Mayor.create(:name => 'Daley', :city => cities.first)
|
||||
#
|
||||
#User.create(:name => "bla", :email => "abc@def.com", :password => "abc", :password_confirmation => "abc", :has_sequence => true)
|
||||
#User.create(:name => 'bla', :email => 'abc@def.com', :password => 'abc', :password_confirmation => 'abc', :has_sequence => true)
|
||||
|
||||
if Achievement.all.length == 0
|
||||
Achievement.create(:award => "Published genotyping", :short_name => "pub_gen")
|
||||
Achievement.create(:award => "Published 10 Mio. SNPs", :short_name => "10_mio")
|
||||
Achievement.create(:award => "Entered first phenotype", :short_name => "1phen")
|
||||
Achievement.create(:award => "Entered 5 additional phenotypes", :short_name => "5phen")
|
||||
Achievement.create(:award => "Entered 10 additional phenotypes", :short_name => "10phen")
|
||||
Achievement.create(:award => "Entered 20 additional phenotypes", :short_name => "20phen")
|
||||
Achievement.create(:award => "Entered 50 additional phenotypes", :short_name => "50phen")
|
||||
Achievement.create(:award => "Entered 100 additional phenotypes", :short_name => "100phen")
|
||||
Achievement.create(:award => "Created a new phenotype", :short_name => "1addphen")
|
||||
Achievement.create(:award => "Created 5 new phenotypes", :short_name => "5addphen")
|
||||
Achievement.create(:award => "Created 10 new phenotypes", :short_name => "10addphen")
|
||||
time = Time.now.utc
|
||||
# This is written in SQL to prevent Solr from indexing... *le sigh*
|
||||
Achievement.connection.execute(<<-SQL)
|
||||
INSERT INTO achievements (award, short_name, created_at, updated_at)
|
||||
VALUES
|
||||
('Published genotyping', 'pub_gen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Published 10 Mio. SNPs', '10_mio', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Entered first phenotype', '1phen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Entered 5 additional phenotypes', '5phen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Entered 10 additional phenotypes', '10phen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Entered 20 additional phenotypes', '20phen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Entered 50 additional phenotypes', '50phen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Entered 100 additional phenotypes', '100phen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Created a new phenotype', '1addphen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Created 5 new phenotypes', '5addphen', '#{time.iso8601}', '#{time.iso8601}'),
|
||||
('Created 10 new phenotypes', '10addphen', '#{time.iso8601}', '#{time.iso8601}')
|
||||
SQL
|
||||
end
|
||||
|
||||
197
spec/integration/genotype_parsing_and_deleting_spec.rb
Normal file
197
spec/integration/genotype_parsing_and_deleting_spec.rb
Normal file
@@ -0,0 +1,197 @@
|
||||
require 'spec_helper'
|
||||
|
||||
# Integration spec for genotype upload handling. For every supported provider
# format a sample file is attached to a new Genotype, the Preparsing worker is
# run by hand, and the resulting Snp / UserSnp rows are compared against the
# fixture contents. The shared verification hook then destroys the genotype
# and checks that no Genotype, UserSnp or Snp rows are left behind.
describe 'genotype parsing', sidekiq: :inline do
  before do
    # When running the background jobs inline, Paperclip hasn't saved the file,
    # yet. So we mock the after create hook and run the job manually.
    allow_any_instance_of(Genotype).to receive(:parse_genotype)
    Preparsing.new.perform(genotype.id)
  end

  # Release the fixture file handle opened by `let(:file)`. Declared before the
  # verification hook below because RSpec runs `after` hooks in reverse
  # declaration order, so the handle is closed last.
  after do
    file.close unless file.closed?
  end

  # Every example is expected to have produced exactly one genotype with five
  # SNPs; destroying the genotype must remove all of them again.
  after do
    expect(Genotype.count).to be(1)
    expect(UserSnp.count).to be(5)
    expect(Snp.count).to be(5)
    genotype.destroy
    expect(Genotype.count).to be_zero
    expect(UserSnp.count).to be_zero
    expect(Snp.count).to be_zero
  end

  context '23andMe' do
    let(:file) { File.open(Rails.root.join('test/data/23andMe_test.csv')) }
    let(:genotype) do
      create(:genotype, genotype: file, filetype: '23andme')
    end

    it 'parses 23andMe data', truncate: true do
      # Snp
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking]
      end
      snp_data = snp_data.sort_by { |s| s[0] }

      expected = [
        ['rs11240777', '788822', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs12124819', '766409', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs3094315', '742429', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs3131972', '742584', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs4477212', '72017', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0]
      ]

      expect(snp_data).to match_array(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(AA AA GG AG AG)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      # Query the SNP names once instead of once per user SNP.
      snp_names = Snp.pluck(:name)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(snp_names).to include(s.snp_name)
      end
    end
  end

  context 'deCODEme' do
    let(:file) { File.open(Rails.root.join('test/data/deCODEme_test.csv')) }
    let(:genotype) do
      create(:genotype, genotype: file, filetype: 'decodeme')
    end

    it 'parse deCODEme data', truncate: true do
      # Snp
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking, s.user_snps_count]
      end.sort_by { |s| s[0] }

      expected = [
        ['rs11240767', '718814', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs2185539', '556738', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs3094315', '742429', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs4477212', '72017', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs6681105', '581938', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
      ]

      expect(snp_data).to match_array(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(AA CC TT CC TT)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      # Query the SNP names once instead of once per user SNP.
      snp_names = Snp.pluck(:name)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(snp_names).to include(s.snp_name)
      end
    end
  end

  context 'ancestry' do
    let(:file) { File.open(Rails.root.join('test/data/ancestry_test.csv')) }
    let(:genotype) do
      create(:genotype, genotype: file, filetype: 'ancestry')
    end

    it 'parse ancestry data', truncate: true do
      # Snp
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking, s.user_snps_count]
      end.sort_by { |s| s[0] }

      expected = [
        ['rs4477212', '82154', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs3131972', '752721', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs12562034', '768448', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs11240777', '798959', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs6681049', '800007', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
      ]

      expect(snp_data).to match_array(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(CC CC CC CC CC)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      # Query the SNP names once instead of once per user SNP.
      snp_names = Snp.pluck(:name)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(snp_names).to include(s.snp_name)
      end
    end
  end

  context 'ftdna-illumina' do
    let(:file) { File.open(Rails.root.join('test/data/ftdna-illumina_sample.csv')) }
    let(:genotype) do
      create(:genotype, genotype: file, filetype: 'ftdna-illumina')
    end

    # Description previously read 'parse ancestry data' (copy-paste slip).
    it 'parse ftdna-illumina data', truncate: true do
      # Snp
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking, s.user_snps_count]
      end.sort_by { |s| s[0] }

      expected = [
        ['rs3094315', '752566', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs3131972', '752721', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs12562034', '768448', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs12124819', '776546', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs11240777', '798959', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
      ]

      expect(snp_data).to match_array(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(AA GG GG AA AG)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      # Query the SNP names once instead of once per user SNP.
      snp_names = Snp.pluck(:name)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(snp_names).to include(s.snp_name)
      end
    end
  end

  context 'IYG' do
    let(:file) { File.open(Rails.root.join('test/data/iyg_sample.csv')) }
    let(:genotype) do
      create(:genotype, genotype: file, filetype: 'IYG')
    end

    # Description previously read 'parse ancestry data' (copy-paste slip).
    it 'parse IYG data', truncate: true do
      # Snp
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking, s.user_snps_count]
      end.sort_by { |s| s[0] }

      expected = [
        ['rs2131925', '1', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs2815752', '1', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs10924081', '1', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs199838004', '3027', 'MT', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs41456348', '4336', 'MT', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
      ]

      expect(snp_data).to match_array(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(GT AA AA T T)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      # Query the SNP names once instead of once per user SNP.
      snp_names = Snp.pluck(:name)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(snp_names).to include(s.snp_name)
      end
    end
  end
end
|
||||
@@ -1,100 +0,0 @@
|
||||
require 'spec_helper'
|
||||
|
||||
describe 'genotype parsing' do
|
||||
let(:temp_file) { Rails.root.join('tmp/snp_file.txt') }
|
||||
|
||||
before do
|
||||
allow(Sidekiq::Client).to receive(:enqueue).with(Preparsing, an_instance_of(Fixnum))
|
||||
FileUtils.rm(temp_file) if File.exist?(temp_file)
|
||||
end
|
||||
|
||||
context '23andMe' do
|
||||
let(:file) { Rails.root.join('test/data/23andMe_test.csv') }
|
||||
let(:genotype) do
|
||||
create(:genotype, genotype_file_name: file.basename, filetype: '23andme')
|
||||
end
|
||||
|
||||
it 'parse 23andMe data', truncate: true do
|
||||
FileUtils.cp(file, temp_file)
|
||||
Parsing.new.perform(genotype.id, temp_file)
|
||||
|
||||
# Snp
|
||||
snp_data = Snp.all.map do |s|
|
||||
[s.name, s.position, s.chromosome, s.genotype_frequency,
|
||||
s.allele_frequency, s.ranking]
|
||||
end
|
||||
snp_data = snp_data.sort_by { |s| s[0] }
|
||||
|
||||
expected = [
|
||||
['rs11240777', '788822', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
|
||||
['rs12124819', '766409', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
|
||||
['rs3094315', '742429', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
|
||||
['rs3131972', '742584', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
|
||||
['rs4477212', '72017', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0]
|
||||
]
|
||||
|
||||
expect(snp_data).to eq(expected)
|
||||
|
||||
# UserSnp
|
||||
user_snps = UserSnp.all
|
||||
user_snp_genotypes = user_snps.map(&:local_genotype)
|
||||
expected_genotypes = %w(AA AA GG AG AG)
|
||||
expect(user_snp_genotypes).to eq(expected_genotypes)
|
||||
user_snps.each do |s|
|
||||
expect(s.genotype_id).to eq(genotype.id)
|
||||
expect(Snp.pluck(:name)).to include(s.snp_name)
|
||||
end
|
||||
end
|
||||
|
||||
# could put these deleting tests into their own file;
|
||||
# however, the genotyping exists at this point in time and we don't have to do any extra work
|
||||
# to pull it from the test DB
|
||||
it 'delete data' do
|
||||
DeleteGenotype.new.perform(genotype)
|
||||
expect(Snp.count).to eq(0)
|
||||
end
|
||||
end
|
||||
|
||||
context 'deCODEme' do
|
||||
let(:file) { Rails.root.join('test/data/deCODEme_test.csv') }
|
||||
let(:genotype) do
|
||||
create(:genotype, genotype_file_name: file.basename, filetype: 'decodeme')
|
||||
end
|
||||
|
||||
it 'parse deCODEme data', truncate: true do
|
||||
FileUtils.cp file, temp_file
|
||||
Parsing.new.perform(genotype.id, temp_file)
|
||||
|
||||
# Snp
|
||||
snp_data = Snp.all.map do |s|
|
||||
[s.name, s.position, s.chromosome, s.genotype_frequency,
|
||||
s.allele_frequency, s.ranking, s.user_snps_count]
|
||||
end.sort_by { |s| s[0] }
|
||||
|
||||
expected = [
|
||||
['rs11240767', '718814', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
|
||||
['rs2185539', '556738', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
|
||||
['rs3094315', '742429', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
|
||||
['rs4477212', '72017', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
|
||||
['rs6681105', '581938', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
|
||||
]
|
||||
|
||||
expect(snp_data).to eq(expected)
|
||||
|
||||
# UserSnp
|
||||
user_snps = UserSnp.all
|
||||
user_snp_genotypes = user_snps.map(&:local_genotype)
|
||||
expected_genotypes = %w(AA CC TT CC TT)
|
||||
expect(user_snp_genotypes).to eq(expected_genotypes)
|
||||
user_snps.each do |s|
|
||||
expect(s.genotype_id).to eq(genotype.id)
|
||||
expect(Snp.pluck(:name)).to include(s.snp_name)
|
||||
end
|
||||
end
|
||||
|
||||
it 'delete deCODEme data' do
|
||||
DeleteGenotype.new.perform(genotype)
|
||||
expect(Snp.count).to eq(0)
|
||||
end
|
||||
end
|
||||
end
|
||||
17
spec/mailers/user_mailer_spec.rb
Normal file
17
spec/mailers/user_mailer_spec.rb
Normal file
@@ -0,0 +1,17 @@
|
||||
require 'spec_helper'
|
||||
|
||||
# Specs for UserMailer, using doubles in place of the user and genotype.
describe UserMailer do
  let(:user) { double(:user, name: 'Lord Schmorgoroth', email: 'ls@example.com') }
  let(:genotype) { double('genotype', id: 1, user: user, filetype: '23andme') }
  let(:stats) { { rows_without_comments: 2, rows_after_parsing: 1 } }

  describe '#finished_parsing' do
    it 'notifies the user about his genotype having been parsed' do
      # The mailer is handed an id, so the lookup is intercepted to return the double.
      expect(Genotype).to receive(:find).with(genotype.id).and_return(genotype)

      described_class.finished_parsing(genotype.id, stats).deliver

      body = ActionMailer::Base.deliveries.last.body.raw_source
      expect(body).to include(user.name)
      expect(body).to include('23andMe')
    end
  end
end
|
||||
@@ -7,8 +7,6 @@ require 'factory_girl_rails'
|
||||
require 'sunspot_test/rspec'
|
||||
require 'pry-rails' unless ENV['CI']
|
||||
|
||||
Sidekiq::Testing.inline!
|
||||
|
||||
# Requires supporting ruby files with custom matchers and macros, etc,
|
||||
# in spec/support/ and its subdirectories.
|
||||
Dir[Rails.root.join("spec/support/**/*.rb")].each { |f| require f }
|
||||
@@ -64,4 +62,18 @@ RSpec.configure do |config|
|
||||
# Run DatabaseCleaner after every example.
config.after(:example) { DatabaseCleaner.clean }
|
||||
|
||||
# Clear all queued Sidekiq jobs before every example and pick the Sidekiq
# testing mode from the example's metadata: an explicit `sidekiq: :fake` or
# `sidekiq: :inline` tag wins; without a tag, examples of type :acceptance
# use inline mode and everything else uses fake mode.
config.before(:each) do |example|
  Sidekiq::Worker.clear_all

  tags = example.metadata
  inline_mode =
    case tags[:sidekiq]
    when :fake then false
    when :inline then true
    else tags[:type] == :acceptance
    end

  if inline_mode
    Sidekiq::Testing.inline!
  else
    Sidekiq::Testing.fake!
  end
end
|
||||
end
|
||||
|
||||
18
spec/workers/parsing_spec.rb
Normal file
18
spec/workers/parsing_spec.rb
Normal file
@@ -0,0 +1,18 @@
|
||||
require 'spec_helper'
|
||||
|
||||
# Specs for the Parsing worker.
describe Parsing do
  describe '#notify_user' do
    let(:genotype) { double('genotype', id: 1) }
    let(:stats) { { foos: 7 } }

    it 'sends an email to the user' do
      # Presumably notify_user reads @stats and @genotype from the worker
      # instance, so they are seeded directly rather than by a full parse run.
      subject.instance_variable_set(:@stats, stats)
      subject.instance_variable_set(:@genotype, genotype)
      # `delay` is stubbed to hand back the mailer class itself so the
      # follow-up `finished_parsing` call can be asserted on the same object.
      expect(UserMailer).to receive(:delay).and_return(UserMailer)
      expect(UserMailer).to receive(:finished_parsing).with(genotype.id, stats)

      subject.notify_user
    end
  end
end
|
||||
22
test/data/ancestry_test.csv
Normal file
22
test/data/ancestry_test.csv
Normal file
@@ -0,0 +1,22 @@
|
||||
#AncestryDNA raw data download
|
||||
#This file was generated by AncestryDNA at: some_time
|
||||
#Data was collected using AncestryDNA array version: V1.0
|
||||
#Data is formatted using AncestryDNA converter version: V1.0
|
||||
#Below is a text version of your DNA file from Ancestry.com DNA, LLC. THIS
|
||||
#INFORMATION IS FOR YOUR PERSONAL USE AND IS INTENDED FOR GENEALOGICAL RESEARCH
|
||||
#ONLY. IT IS NOT INTENDED FOR MEDICAL OR HEALTH PURPOSES. THE EXPORTED DATA IS
|
||||
#SUBJECT TO THE AncestryDNA TERMS AND CONDITIONS, BUT PLEASE BE AWARE THAT THE
|
||||
#DOWNLOADED DATA WILL NO LONGER BE PROTECTED BY OUR SECURITY MEASURES.
|
||||
#
|
||||
#Genetic data is provided below as five TAB delimited columns. Each line
|
||||
#corresponds to a SNP. Column one provides the SNP identifier (rsID where
|
||||
#possible). Columns two and three contain the chromosome and basepair position
|
||||
#of the SNP using human reference build 37.1 coordinates. Columns four and five
|
||||
#contain the two alleles observed at this SNP (genotype). The genotype is reported
|
||||
#on the forward (+) strand with respect to the human reference.
|
||||
rsid chromosome position allele1 allele2
|
||||
rs4477212 1 82154 C C
|
||||
rs3131972 1 752721 C C
|
||||
rs12562034 1 768448 C C
|
||||
rs11240777 1 798959 C C
|
||||
rs6681049 1 800007 C C
|
||||
|
Can't render this file because it has a wrong number of fields in line 17.
|
@@ -1,3 +1,4 @@
|
||||
Name,Variation,Chromosome,Position,Strand,YourCode
|
||||
rs4477212,A/G,1,72017,+,AA
|
||||
rs2185539,C/T,1,556738,+,CC
|
||||
rs6681105,C/T,1,581938,+,TT
|
||||
|
||||
|
6
test/data/ftdna-illumina_sample.csv
Normal file
6
test/data/ftdna-illumina_sample.csv
Normal file
@@ -0,0 +1,6 @@
|
||||
RSID,CHROMOSOME,POSITION,RESULT
|
||||
"rs3094315","1","752566","AA"
|
||||
"rs3131972","1","752721","GG"
|
||||
"rs12562034","1","768448","GG"
|
||||
"rs12124819","1","776546","AA"
|
||||
"rs11240777","1","798959","AG"
|
||||
|
5
test/data/iyg_sample.csv
Normal file
5
test/data/iyg_sample.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
rs2131925 GT
|
||||
rs2815752 AA
|
||||
rs10924081 AA
|
||||
MT-T3027C T
|
||||
MT-T4336C T
|
||||
|
@@ -14,7 +14,7 @@ unless $factories_already_read
|
||||
|
||||
factory :genotype do
|
||||
genotype_file_name "foo.txt"
|
||||
association :user
|
||||
user
|
||||
end
|
||||
|
||||
factory :snp do
|
||||
|
||||
Reference in New Issue
Block a user