git pushMerge branch 'master' of github.com:gedankenstuecke/snpr

This commit is contained in:
Philipp Bayer
2014-09-01 10:50:03 +10:00
27 changed files with 668 additions and 368 deletions

45
.gitignore vendored
View File

@@ -1,32 +1,31 @@
mail_username.txt
mail_password.txt
public/system/images/*
dump.rdb
sunspot-solr-development.log.*
key_mendeley.txt
key_plos.txt
api_key.txt
.bundle
config/app_config.yml
config/database.yml
config/newrelic.yml
coverage
db/*.sqlite3
log/*.log
tmp/pids/server.pid
tmp/cache
public/data/*23andme*
development.log
dump.rdb
hiredis/
key_mendeley.txt
key_plos.txt
log/
log/*.log
mail_password.txt
mail_username.txt
/public/data/
/public/system/images/
.sass-cache/
secret_token
server.pid
*.swp
solr/data
solr/pids
config/database.yml
sunspot-solr-development.log.*
*.swp
tmp/
tmp/cache
tmp/pids/server.pid
.vagrant
vendor/bundle
vendor/cache
coverage
hiredis/
log/
secret_token
config/app_config.yml
config/newrelic.yml
tmp/
.vagrant
.sass-cache/
public/data/zip/opensnp_datadump.current.zip

View File

@@ -14,8 +14,9 @@ before_script:
- echo "bar" > mail_password.txt
- echo "f7264dd590e09007703723a75550c748" > secret_token
- psql -c 'create database snpr_test;' -U postgres
# script: "bundle install" # this is automatically run by travis-ci
- bundle exec rake db:setup
services:
- redis-server
script: "bundle exec rake --trace db:migrate test"
addons:
postgresql: "9.2"
script: "bundle exec rake test"

View File

@@ -22,11 +22,10 @@ gem 'plos', require: false
# New Relic monitoring, off by default in development
gem 'newrelic_rpm'
# workaround for bug in Fedora
# DB
gem 'pg'
gem 'activerecord-import', '~> 0.2.11'
gem 'composite_primary_keys'
# for solr (indexing, searching)
gem 'sunspot_rails'#, '2.0.0'

View File

@@ -127,6 +127,8 @@ GEM
coffee-script-source
execjs
coffee-script-source (1.6.3)
composite_primary_keys (5.0.14)
activerecord (~> 3.2.0, >= 3.2.9)
connection_pool (1.1.0)
crack (0.4.1)
safe_yaml (~> 0.9.0)
@@ -396,6 +398,7 @@ DEPENDENCIES
capistrano (~> 2.0)
capybara
coffee-script
composite_primary_keys
database_cleaner
devise (= 3.0.0)
dynamic_form

View File

@@ -84,4 +84,19 @@ default :from => "donotreply@opensnp.org"
puts "http://"+ActionMailer::Base.default_url_options[:host]+@link
end
# Builds the "finished parsing" notification for the owner of a genotype.
#
# genotype_id - id of the Genotype whose file was parsed.
# stats       - parsing statistics, exposed to the mail template as @stats.
#
# Raises KeyError (from Hash#fetch) when the genotype's filetype has no
# display name in the table below.
def finished_parsing(genotype_id, stats)
  # Maps the stored filetype key to the vendor name shown in the mail.
  vendor_names = {
    'ftdna-illumina' => 'FamilyTreeDNA',
    '23andme' => '23andMe',
    'IYG' => 'Inside Your Genome',
    'decodeme' => 'deCODEme',
    '23andme-exome-vcf' => '23andMe',
    'ancestry' => 'Ancestry',
  }

  parsed = Genotype.find(genotype_id)
  @user = parsed.user
  @stats = stats
  @vendor = vendor_names.fetch(parsed.filetype)

  mail(to: @user.email, subject: 'Finished parsing your genotyping')
end
end

View File

@@ -1,8 +1,8 @@
class Achievement < ActiveRecord::Base
attr_accessible :award,:short_name
has_many :user_achievements
attr_accessible :award,:short_name
has_many :user_achievements
searchable do
text :award
end
searchable do
text :award
end
end

View File

@@ -26,11 +26,11 @@ class Genotype < ActiveRecord::Base
end
def parse_genotype
Sidekiq::Client.enqueue(Preparsing, id)
Preparsing.perform_async(id)
end
def delete_genotype
Sidekiq::Client.enqueue(DeleteGenotype, { genotype_id: id })
DeleteGenotype.perform_async(genotype_id: id)
end
Paperclip.interpolates :fs_filename do |attachment, style|

View File

@@ -1,6 +1,5 @@
class Snp < ActiveRecord::Base
has_many :user_snps, foreign_key: :snp_name, primary_key: :name,
dependent: :destroy
has_many :user_snps, foreign_key: :snp_name, primary_key: :name
has_many :users, through: :user_snps
has_many :pgp_annotations
has_many :snp_references

View File

@@ -1,6 +1,6 @@
class UserSnp < ActiveRecord::Base
belongs_to :snp, foreign_key: :snp_name, primary_key: :name,
counter_cache: true
self.primary_keys = [:genotype_id, :snp_name]
belongs_to :snp, foreign_key: :snp_name, primary_key: :name, counter_cache: true
has_one :user, through: :genotype
belongs_to :genotype

View File

@@ -6,6 +6,9 @@
<% if @unkown_chromosome[i] == false %>
<SEGMENT id="<%= @id[i] %>" version="1.0" <%if @has_start[i] == true %> start="<%= @start_and_end[i][0] %>" stop="<%= @start_and_end[i][1] %>"<%else%> start="1" stop="" <%end%>>
<% @user_snps[i].each do |us| %>
<% if us.snp.nil? %>
<% next %>
<% end %>
<FEATURE id="<%= us.snp.name %>">
<TYPE id="<%= us.local_genotype %>" />
<METHOD id="" />

View File

@@ -0,0 +1,10 @@
Hello <%= @user.name %>,
I just finished parsing your <%= @vendor %> file. I found a total of
<%= @stats[:rows_without_comments] %> rows (excluding comments) in it and was
able to import <%= @stats[:rows_after_parsing] %> SNPs.
If this seems way off, feel free to contact us!
Cheers,
The OpenSNP Parser

View File

@@ -11,9 +11,8 @@ class DeleteGenotype
# This user_snp is the only one, so, destroy the Snp,
# which destroys the UserSnp implicitly
us.snp.destroy
else
us.destroy
end
us.destroy
end
end
end

View File

@@ -2,139 +2,227 @@ class Parsing
include Sidekiq::Worker
sidekiq_options :queue => :parse, :retry => 5, :unique => true
def perform(genotype_id, temp_file)
Rails.logger.level = 0
Rails.logger = Logger.new("#{Rails.root}/log/parsing_#{Rails.env}.log")
genotype_id = genotype_id["genotype"]["id"].to_i if genotype_id.is_a?(Hash)
attr_reader :genotype, :temp_table_name, :tempfile, :stats, :start_time
def perform(genotype_id)
@stats = {}
@start_time = Time.current
logger.info("Started parsing genotype with id #{genotype_id}")
@genotype = Genotype.find(genotype_id)
if @genotype.filetype != "other"
# IYG filetype needs proper dbSNP-names
if @genotype.filetype == "IYG"
db_snp_snps = {"MT-T3027C"=>"rs199838004", "MT-T4336C"=>"rs41456348", "MT-G4580A"=>"rs28357975", "MT-T5004C"=>"rs41419549", "MT-C5178a"=>"rs28357984", "MT-A5390G"=>"rs41333444", "MT-C6371T"=>"rs41366755", "MT-G8697A"=>"rs28358886", "MT-G9477A"=>"rs2853825", "MT-G10310A"=>"rs41467651", "MT-A10550G"=>"rs28358280", "MT-C10873T"=>"rs2857284", "MT-C11332T"=>"rs55714831", "MT-A11947G"=>"rs28359168", "MT-A12308G"=>"rs2853498", "MT-A12612G"=>"rs28359172", "MT-T14318C"=>"rs28357675", "MT-T14766C"=>"rs3135031", "MT-T14783C"=>"rs28357680"}
end
stats[:filetype] = genotype.filetype
stats[:genotype_id] = genotype.id
@temp_table_name = "user_snps_temp_#{genotype.id}"
@tempfile = Tempfile.new("snpr_genotype_#{genotype.id}_")
genotype_file = File.open(temp_file, "r")
log "Loading known Snps."
known_snps = Snp.pluck(:name).to_set
user_genotype_ids = @genotype.user.genotypes.pluck(:id)
known_user_snps = UserSnp.where(genotype_id: user_genotype_ids).
pluck('distinct(snp_name)').to_set
new_snps = []
new_user_snps = []
create_temp_table
normalize_csv
copy_csv_into_temp_table
insert_into_snps
insert_into_user_snps
notify_user
log "Parsing file #{temp_file}"
# open that file, go through each line
genotype_file.each do |single_snp|
next if single_snp[0] == "#"
logger.info("Finished parsing genotype with id #{genotype.id}, cleaning up.")
rescue => e
logger.error("Failed with #{e.class}: #{e.message}")
raise
ensure
drop_temp_table
# TODO: Why doesn't `tempfile.unlink` work here?
File.delete(tempfile.path)
stats[:duration] = "#{(Time.current - start_time).round(3)}s"
logger.info("Stats: #{stats.to_a.map { |s| s.join('=') }.join(', ')}")
end
# make a nice array if line is no comment
if @genotype.filetype == "IYG"
prior_snp_array = single_snp.gsub("\n","").split("\t")
name = prior_snp_array[0]
if name.starts_with? "MT"
# check whether it's in db_snp_snps, use that name
position = name.tr('A-Za-z-','') # MT-G1234G -> 1234
# Creates the staging table named by `temp_table_name`, first dropping any
# existing table of that name so the method can be re-run.
# The five columns match the column list used by copy_csv_into_temp_table.
# NOTE(review): snp_name/chromosome/position are capped at 32 characters and
# local_genotype at 2; longer values presumably make the later COPY fail --
# confirm this is the intended limit.
def create_temp_table
execute("drop table if exists #{temp_table_name}")
execute(<<-SQL)
create table #{temp_table_name} (
genotype_id int,
snp_name varchar(32),
chromosome varchar(32),
position varchar(32),
local_genotype char(2)
)
SQL
end
if db_snp_snps[name] # do we have a dbSNP-name?
name = db_snp_snps[name]
end
snp_array = [name, "MT", position, prior_snp_array[1]]
else
snp_array = [prior_snp_array[0], "1", "1", prior_snp_array[1]]
end
log "SNP_ARRAY IS"
log snp_array
elsif @genotype.filetype == "23andme"
snp_array = single_snp.split("\t")
elsif @genotype.filetype == "ancestry"
temp_array = single_snp.split("\t")
if temp_array[0] != "rsid"
snp_array = [temp_array[0],temp_array[1],temp_array[2],temp_array[3]+temp_array[4]]
else
next
end
elsif @genotype.filetype == "decodeme"
temp_array = single_snp.split(",")
if temp_array[0] != "Name"
snp_array = [temp_array[0],temp_array[2],temp_array[3],temp_array[5]]
else
next
end
elsif @genotype.filetype == "ftdna-illumina"
temp_array = single_snp.split("\",\"")
if temp_array[0].index("RSID") == nil
if temp_array[0] != nil and temp_array[1] != nil and temp_array[2] != nil and temp_array[3] != nil
snp_array = [temp_array[0].gsub("\"",""),temp_array[1].gsub("\"",""),temp_array[2].gsub("\"",""),temp_array[3].gsub("\"","").rstrip]
else
UserMailer.parsing_error(@genotype.user_id).deliver
break
end
else
next
end
elsif @genotype.filetype == "23andme-exome-vcf"
temp_array = single_snp.split("\t")
@format_array = temp_array[-2].split(":")
@format_array.each_with_index do |element,index|
if element == "GT"
@genotype_position = index
end
end
@genotype_non_parsed = temp_array[-1].split(":")[@genotype_position].split("/")
@genotype_parsed = ""
@genotype_non_parsed.each do |allele|
if allele == "0"
@genotype_parsed = @genotype_parsed + temp_array[3]
elsif allele == "1"
@genotype_parsed = @genotype_parsed + temp_array[4]
end
end
snp_array = [temp_array[2].downcase,temp_array[0],temp_array[1],@genotype_parsed.upcase]
unless known_snps.include?(snp_array[0].downcase)
next
end
end
# Drops the staging table named by `temp_table_name`.
#
# This is called from the `ensure` clause of #perform, so it also runs when
# parsing failed before the table (or even its name) was set up. It therefore
# does nothing when no name is known and uses `if exists`, so that cleanup
# never raises a second error that would hide the original failure.
def drop_temp_table
  return unless temp_table_name

  execute("drop table if exists #{temp_table_name}")
end
if snp_array[0] && snp_array[1] && snp_array[2] && snp_array[3] && snp_array[3].strip.length == 2
# if we do not have the fitting SNP, make one and parse all paper-types for it
unless known_snps.include?(snp_array[0].downcase)
snp = Snp.new(:name => snp_array[0].downcase, :chromosome => snp_array[1], :position => snp_array[2], :ranking => 0, :user_snps_count => 1)
snp.default_frequencies
new_snps << snp
end
if known_user_snps.include?(snp_array[0].downcase)
log "already known user-snp"
else
new_user_snps << [ @genotype.id, snp_array[0].downcase, snp_array[3].rstrip ]
end
else
UserMailer.parsing_error(@genotype.user_id).deliver
break
end
end
log "Importing #{new_snps.length} new Snps"
Snp.import new_snps
# Reads the uploaded genotype file, converts it to a uniform CSV and writes
# that CSV to `tempfile`.
#
# Steps:
# 1. Read all lines of the uploaded file and drop comment lines ('#').
# 2. Dispatch to the vendor-specific `parse_<filetype>` method, which returns
#    rows of [genotype_id, snp_name, chromosome, position, local_genotype].
# 3. Keep only rows with a SNP name, a known chromosome, a position between
#    1 and 249_250_621 and a genotype of one or two characters.
# 4. Write the surviving rows comma-joined to the tempfile, close it and make
#    it world-readable.
#
# Records :rows_without_comments and :rows_after_parsing in `stats`.
# Raises NoMethodError when no parser exists for the genotype's filetype.
def normalize_csv
  rows = File.readlines(genotype.genotype.path)
    .reject { |line| line.start_with?('#') } # Skip comments
  stats[:rows_without_comments] = rows.length

  # Replace *every* hyphen: filetypes such as "23andme-exome-vcf" contain
  # more than one, and a hyphen is not valid in a method name.
  parser = :"parse_#{genotype.filetype.tr('-', '_').downcase}"
  csv = send(parser, rows)

  known_chromosomes = ['MT', 'X', 'Y', *(1..22).map(&:to_s)]
  csv.select! do |row|
    # snp name
    row[1].present? &&
      # chromosome
      known_chromosomes.include?(row[2]) &&
      # position
      row[3].to_i >= 1 && row[3].to_i <= 249_250_621 &&
      # local genotype
      row[4].is_a?(String) && (1..2).include?(row[4].length)
  end
  stats[:rows_after_parsing] = csv.length

  tempfile.write(csv.map { |row| row.join(',') }.join("\n"))
  tempfile.close
  # Widen the permissions so a process other than the worker can read it.
  FileUtils.chmod(0644, tempfile.path)
end
log "Importing new UserSnps"
user_snp_columns = [:genotype_id, :snp_name, :local_genotype]
UserSnp.import user_snp_columns, new_user_snps, validate: false
log "Done."
puts "done with #{temp_file}"
system("rm #{temp_file}")
# Bulk-loads the normalized CSV at `tempfile.path` into the staging table
# using a SQL COPY statement (CSV format, no header row, comma delimiter).
# NOTE(review): COPY ... FROM '<file>' is read by the PostgreSQL server
# process, not by this worker -- presumably why normalize_csv chmods the
# file to 0644. Confirm the database runs on the same host/filesystem.
def copy_csv_into_temp_table
execute(<<-SQL)
copy #{temp_table_name} (
genotype_id,
snp_name,
chromosome,
position,
local_genotype
)
from '#{tempfile.path}'
with (FORMAT CSV, HEADER FALSE, DELIMITER ',')
SQL
end
# Inserts into `snps` every staged SNP whose name is not yet present there
# (left join on name, keeping rows where no match was found).
# created_at/updated_at are both set to the current UTC time and
# user_snps_count starts at 1.
# NOTE(review): the select has no DISTINCT, so a name that appears more than
# once in the staging table would presumably be inserted more than once --
# confirm whether input files can contain duplicate names.
def insert_into_snps
time = Time.now.utc.iso8601
execute(<<-SQL)
insert into snps (name, chromosome, position, created_at, updated_at, user_snps_count)
(
select
#{temp_table_name}.snp_name,
#{temp_table_name}.chromosome,
#{temp_table_name}.position,
'#{time}',
'#{time}',
1
from #{temp_table_name}
left join snps
on #{temp_table_name}.snp_name = snps.name
where
snps.name is null
)
SQL
end
# Inserts into `user_snps` every staged row for which no user_snps row with
# the same (snp_name, genotype_id) pair exists yet (left join, keeping rows
# where no match was found), so re-running does not re-insert existing pairs.
def insert_into_user_snps
execute(<<-SQL)
insert into user_snps (snp_name, local_genotype, genotype_id)
(
select
#{temp_table_name}.snp_name,
#{temp_table_name}.local_genotype,
#{temp_table_name}.genotype_id
from #{temp_table_name}
left join user_snps
on user_snps.snp_name = #{temp_table_name}.snp_name
and user_snps.genotype_id = #{temp_table_name}.genotype_id
where user_snps.snp_name is null
)
SQL
end
# Converts tab-separated 23andMe lines into normalized rows.
#
# rows - Array of raw lines (name, chromosome, position, genotype).
#
# Returns an Array of [genotype_id, name, chromosome, position, genotype];
# columns missing from a line come back as nil.
def parse_23andme(rows)
  rows.map do |line|
    columns = line.strip.split("\t")
    [genotype.id, *columns.values_at(0, 1, 2, 3)]
  end
end
def log msg
Rails.logger.info "#{DateTime.now}: #{msg}"
# Converts comma-separated deCODEme lines into normalized rows.
#
# rows - Array of raw lines; a leading header line starting with "Name" is
#        removed from the array in place.
#
# Returns an Array of [genotype_id, name, chromosome, position, genotype],
# taken from columns 0, 2, 3 and 5. Missing columns come back as nil.
# An empty `rows` array yields an empty result instead of raising.
def parse_decodeme(rows)
  header = rows.first
  rows.shift if header && header.start_with?('Name')
  rows.map do |row|
    fields = row.strip.split(',')
    [genotype.id, *fields.values_at(0, 2, 3, 5)]
  end
end
# Converts tab-separated Ancestry lines into normalized rows.
#
# rows - Array of raw lines; a leading header line starting with "rsid" is
#        removed from the array in place.
#
# Returns an Array of [genotype_id, name, chromosome, position, genotype].
# The genotype is the two allele columns (3 and 4) concatenated, so it is
# always a String (possibly empty). Other missing columns come back as nil.
# An empty `rows` array yields an empty result instead of raising.
def parse_ancestry(rows)
  header = rows.first
  rows.shift if header && header.start_with?('rsid')
  rows.map do |row|
    fields = row.strip.split("\t")
    [
      genotype.id,
      fields[0],
      fields[1],
      fields[2],
      "#{fields[3]}#{fields[4]}"
    ]
  end
end
# Converts comma-separated FamilyTreeDNA (Illumina) lines into normalized
# rows, removing the double quotes around each value.
#
# rows - Array of raw lines; a leading header line starting with "RSID" is
#        removed from the array in place.
#
# Returns an Array of [genotype_id, name, chromosome, position, genotype].
# Blank or short lines produce nil for the missing columns instead of
# raising NoMethodError, matching the other parsers. An empty `rows` array
# yields an empty result.
def parse_ftdna_illumina(rows)
  header = rows.first
  rows.shift if header && header.start_with?('RSID')
  rows.map do |row|
    fields = row.strip.split(',').values_at(0, 1, 2, 3)
    [genotype.id] + fields.map { |field| field && field.delete('"') }
  end
end
# Converts tab-separated "Inside Your Genome" lines (name, genotype) into
# normalized rows.
#
# rows - Array of raw lines.
#
# Names starting with "MT" are placed on chromosome "MT" with the first run
# of digits in the name as position, and are replaced by their dbSNP name
# when one is listed below. All other SNPs get chromosome and position "1".
#
# Returns an Array of [genotype_id, name, chromosome, position, genotype].
# Lines without a genotype column (e.g. blank lines) yield a nil genotype
# instead of raising NoMethodError.
def parse_iyg(rows)
  db_snp_names = {
    "MT-T3027C" => "rs199838004", "MT-T4336C" => "rs41456348",
    "MT-G4580A" => "rs28357975", "MT-T5004C" => "rs41419549",
    "MT-C5178a" => "rs28357984", "MT-A5390G" => "rs41333444",
    "MT-C6371T" => "rs41366755", "MT-G8697A" => "rs28358886",
    "MT-G9477A" => "rs2853825", "MT-G10310A" => "rs41467651",
    "MT-A10550G" => "rs28358280", "MT-C10873T" => "rs2857284",
    "MT-C11332T" => "rs55714831", "MT-A11947G" => "rs28359168",
    "MT-A12308G" => "rs2853498", "MT-A12612G" => "rs28359172",
    "MT-T14318C" => "rs28357675", "MT-T14766C" => "rs3135031",
    "MT-T14783C" => "rs28357680"
  }
  rows.map do |row|
    snp_name, local_genotype = row.split("\t")
    # to_s guards against an empty line, for which split returns no fields.
    if snp_name.to_s.start_with?('MT')
      position = snp_name[/[0-9]+/]
      chromosome = 'MT'
    else
      position = chromosome = '1'
    end
    [
      genotype.id,
      db_snp_names.fetch(snp_name, snp_name),
      chromosome,
      position,
      local_genotype && local_genotype.strip
    ]
  end
end
# Tells the user that parsing is done by calling
# UserMailer.delay.finished_parsing with the genotype id and the collected
# stats. The `delay` proxy presumably queues the mail in the background
# rather than sending it inline -- confirm against the Sidekiq setup.
def notify_user
UserMailer.delay.finished_parsing(genotype.id, stats)
end
# Runs the given raw SQL string on Genotype's database connection and
# returns whatever the connection's `execute` returns.
def execute(sql)
Genotype.connection.execute(sql)
end
# Lazily builds and memoizes the logger for this worker. It writes to
# log/parsing_<env>.log under Rails.root using Ruby's standard formatter.
def logger
  @logger ||= begin
    parse_log = Logger.new(Rails.root.join("log/parsing_#{Rails.env}.log"))
    parse_log.formatter = Logger::Formatter.new
    parse_log
  end
end
end

View File

@@ -6,17 +6,13 @@ class Preparsing
sidekiq_options :queue => :preparse, :retry => 10, :unique => true # only retry 10 times - after that, the genotyping probably has already been deleted
def perform(genotype_id)
Rails.logger.level = 0
Rails.logger = Logger.new("#{Rails.root}/log/preparsing_#{Rails.env}.log")
genotype_id = genotype_id["genotype"]["id"].to_i if genotype_id.is_a?(Hash)
@genotype = Genotype.find(genotype_id)
filename = "#{Rails.root}/public/data/#{@genotype.fs_filename}"
log "Starting preparse"
genotype = Genotype.find(genotype_id)
logger.info "Starting preparse"
biggest = ''
biggest_size = 0
begin
Zip::File.open(filename) do |zipfile|
Zip::File.open(genotype.genotype.path) do |zipfile|
# find the biggest file, since that's going to be the genotyping
zipfile.each do |entry|
if entry.size > biggest_size
@@ -24,108 +20,104 @@ class Preparsing
biggest_size = entry.size
end
end
zipfile.extract(biggest,"#{Rails.root}/tmp/#{@genotype.fs_filename}.csv")
system("mv #{Rails.root}/tmp/#{@genotype.fs_filename}.csv #{Rails.root}/public/data/#{@genotype.fs_filename}")
log "copied file"
zipfile.extract(biggest,"#{Rails.root}/tmp/#{genotype.fs_filename}.csv")
system("mv #{Rails.root}/tmp/#{genotype.fs_filename}.csv #{Rails.root}/public/data/#{genotype.fs_filename}")
logger.info "copied file"
end
rescue
log "nothing to unzip, seems to be a text-file in the first place"
logger.info "nothing to unzip, seems to be a text-file in the first place"
end
# now that they are unzipped, check if they're actually proper files
file_is_ok = false
fh = File.open("#{Rails.root}/public/data/#{@genotype.fs_filename}")
fh = File.open(genotype.genotype.path)
l = fh.readline()
# some files, for some reason, start with the UTF-BOM-marker
l = l.sub("\uFEFF","")
# iterate as long as there's commenting going on
while l.start_with?("#")
l = fh.readline()
l = l.sub("\uFEFF","")
l = fh.readline()
l = l.sub("\uFEFF","")
end
if @genotype.filetype == "23andme"
# first non-comment line is of length 4 after split
if l.split("\t").length == 4
log "file is 23andme and is ok!"
file_is_ok = true
end
elsif @genotype.filetype == "ancestry"
if genotype.filetype == "23andme"
# first non-comment line is of length 4 after split
if l.split("\t").length == 4
logger.info "file is 23andme and is ok!"
file_is_ok = true
end
elsif genotype.filetype == "ancestry"
# first line is of length 5
if l.split("\t").length == 5
file_is_ok = true
log "file is ancestry and is ok!"
file_is_ok = true
logger.info "file is ancestry and is ok!"
end
elsif genotype.filetype == "decodeme"
# first line is of length 6
if l.split(",").length == 6
file_is_ok = true
logger.info "file is decodeme and is ok!"
end
elsif genotype.filetype == "ftdna-illumina"
# first line is of length 4
if l.split(",").length == 4
file_is_ok = true
logger.info "file is ftdna and is ok!"
end
elsif genotype.filetype == "23andme-exome-vcf"
#first line is
if l.split("\t").length == 10
file_is_ok = true
logger.info "file is 23andme-exome and is ok!"
end
elsif genotype.filetype == "IYG"
if l.split("\t").length == 2
file_is_ok = true
logger.info "file is IYG and is ok!"
end
elsif @genotype.filetype == "decodeme"
# first line is of length 6
if l.split(",").length == 6
file_is_ok = true
log "file is decodeme and is ok!"
end
elsif @genotype.filetype == "ftdna-illumina"
# first line is of length 4
if l.split(",").length == 4
file_is_ok = true
log "file is ftdna and is ok!"
end
elsif @genotype.filetype == "23andme-exome-vcf"
#first line is
if l.split("\t").length == 10
file_is_ok = true
log "file is 23andme-exome and is ok!"
end
elsif @genotype.filetype == "IYG"
if l.split("\t").length == 2
file_is_ok = true
log "file is IYG and is ok!"
end
end
log "Checking whether genotyping is duplicate"
md5 = Digest::MD5.file("#{Rails.root}/public/data/#{@genotype.fs_filename}").to_s
logger.info "Checking whether genotyping is duplicate"
md5 = Digest::MD5.file("#{Rails.root}/public/data/#{genotype.fs_filename}").to_s
file_is_duplicate = false
Genotype.all.each do |g|
other_md5 = g.md5sum
if other_md5 == md5 and g.id != @genotype.id
log "Genotyping #{filename} is already uploaded!\n"
log "Genotyping #{g.fs_filename} has the same md5sum.\n"
file_is_ok = false
file_is_duplicate = true
end
# Look for another genotype with the same md5sum. The matching record is
# fetched (instead of only counted) so its filename can be logged; the
# previous log line referenced `g`, which is not defined in this method and
# raised NameError whenever a duplicate was found.
duplicate = Genotype.where(md5sum: md5).where('id != ?', genotype.id).first
if duplicate
  file_is_duplicate = true
  logger.info "Genotyping #{genotype.genotype.path} is already uploaded!\n"
  logger.info "Genotyping #{duplicate.fs_filename} has the same md5sum.\n"
  file_is_ok = false
end
# not proper file!
if not file_is_ok
if file_is_duplicate
UserMailer.duplicate_file(@genotype.user_id).deliver
system("rm #{Rails.root}/public/data/#{@genotype.fs_filename}")
Genotype.find_by_id(@genotype.id).delete
else
UserMailer.parsing_error(@genotype.user_id).deliver
log "file is not ok, sending email"
# should delete the uploaded file here, leaving that for now
# might be better to keep the file for debugging
Genotype.find_by_id(@genotype.id).delete
end
if file_is_duplicate
UserMailer.duplicate_file(genotype.user_id).deliver
system("rm #{Rails.root}/public/data/#{genotype.fs_filename}")
Genotype.find_by_id(genotype.id).delete
else
UserMailer.parsing_error(genotype.user_id).deliver
logger.info "file is not ok, sending email"
# should delete the uploaded file here, leaving that for now
# might be better to keep the file for debugging
Genotype.find_by_id(genotype.id).delete
end
else
log "Updating genotype with md5sum #{md5}"
log "Updating genotype #{@genotype.id}"
status = @genotype.update_attributes(:md5sum => md5)
log "Md5-updating-status is #{status}"
logger.info "Updating genotype with md5sum #{md5}"
logger.info "Updating genotype #{genotype.id}"
status = genotype.update_attributes(:md5sum => md5)
logger.info "Md5-updating-status is #{status}"
system("csplit -k -f #{@genotype.id}_tmpfile -n 4 #{filename} 20000 {2000}")
system("mv #{@genotype.id}_tmpfile* tmp/")
temp_files = Dir.glob("tmp/#{@genotype.id}_tmpfile*")
temp_files.each do |single_temp_file|
Sidekiq::Client.enqueue(Parsing, @genotype.id, single_temp_file)
end
Parsing.perform_async(genotype.id)
end
end
def log msg
Rails.logger.info "#{DateTime.now}: #{msg}"
# Lazily builds and memoizes the logger for this worker. It writes to
# log/preparsing_<env>.log under Rails.root using Ruby's standard formatter.
def logger
  @logger ||= begin
    preparse_log = Logger.new(Rails.root.join("log/preparsing_#{Rails.env}.log"))
    preparse_log.formatter = Logger::Formatter.new
    preparse_log
  end
end
end

View File

@@ -0,0 +1,7 @@
# Gives snps.allele_frequency, snps.genotype_frequency and snps.ranking
# database-level defaults, so rows inserted with raw SQL (bypassing the
# model) still get initial values.
#
# Written as up/down rather than `change`: `change_column` cannot be
# reversed automatically, so a `change`-based version raises
# ActiveRecord::IrreversibleMigration on rollback.
class AddDefaultsForFrequenciesToSnps < ActiveRecord::Migration
  def up
    # The string defaults are YAML documents (all four alleles at 0, and an
    # empty hash).
    change_column :snps, :allele_frequency, :string, default: "---\nA: 0\nT: 0\nG: 0\nC: 0\n"
    change_column :snps, :genotype_frequency, :string, default: "--- {}\n"
    change_column :snps, :ranking, :integer, default: 0
  end

  # Restores the previous state, in which none of the three columns had a
  # default.
  def down
    change_column_default :snps, :allele_frequency, nil
    change_column_default :snps, :genotype_frequency, nil
    change_column_default :snps, :ranking, nil
  end
end

View File

@@ -11,7 +11,7 @@
#
# It's strongly recommended to check this file into your version control system.
ActiveRecord::Schema.define(:version => 20140509001806) do
ActiveRecord::Schema.define(:version => 20140820071334) do
create_table "achievements", :force => true do |t|
t.text "award"
@@ -276,9 +276,9 @@ ActiveRecord::Schema.define(:version => 20140509001806) do
t.string "name"
t.string "position"
t.string "chromosome"
t.string "genotype_frequency"
t.string "allele_frequency"
t.integer "ranking"
t.string "genotype_frequency", :default => "--- {}\n"
t.string "allele_frequency", :default => "---\nA: 0\nT: 0\nG: 0\nC: 0\n"
t.integer "ranking", :default => 0
t.integer "number_of_users", :default => 0
t.datetime "mendeley_updated", :default => '2011-08-24 03:44:32'
t.datetime "plos_updated", :default => '2011-08-24 03:44:32'

View File

@@ -6,17 +6,24 @@
# cities = City.create([{ :name => 'Chicago' }, { :name => 'Copenhagen' }])
# Mayor.create(:name => 'Daley', :city => cities.first)
#
#User.create(:name => "bla", :email => "abc@def.com", :password => "abc", :password_confirmation => "abc", :has_sequence => true)
#User.create(:name => 'bla', :email => 'abc@def.com', :password => 'abc', :password_confirmation => 'abc', :has_sequence => true)
if Achievement.all.length == 0
Achievement.create(:award => "Published genotyping", :short_name => "pub_gen")
Achievement.create(:award => "Published 10 Mio. SNPs", :short_name => "10_mio")
Achievement.create(:award => "Entered first phenotype", :short_name => "1phen")
Achievement.create(:award => "Entered 5 additional phenotypes", :short_name => "5phen")
Achievement.create(:award => "Entered 10 additional phenotypes", :short_name => "10phen")
Achievement.create(:award => "Entered 20 additional phenotypes", :short_name => "20phen")
Achievement.create(:award => "Entered 50 additional phenotypes", :short_name => "50phen")
Achievement.create(:award => "Entered 100 additional phenotypes", :short_name => "100phen")
Achievement.create(:award => "Created a new phenotype", :short_name => "1addphen")
Achievement.create(:award => "Created 5 new phenotypes", :short_name => "5addphen")
Achievement.create(:award => "Created 10 new phenotypes", :short_name => "10addphen")
time = Time.now.utc
# This is written in SQL to prevent Solr from indexing... *le sigh*
Achievement.connection.execute(<<-SQL)
INSERT INTO achievements (award, short_name, created_at, updated_at)
VALUES
('Published genotyping', 'pub_gen', '#{time.iso8601}', '#{time.iso8601}'),
('Published 10 Mio. SNPs', '10_mio', '#{time.iso8601}', '#{time.iso8601}'),
('Entered first phenotype', '1phen', '#{time.iso8601}', '#{time.iso8601}'),
('Entered 5 additional phenotypes', '5phen', '#{time.iso8601}', '#{time.iso8601}'),
('Entered 10 additional phenotypes', '10phen', '#{time.iso8601}', '#{time.iso8601}'),
('Entered 20 additional phenotypes', '20phen', '#{time.iso8601}', '#{time.iso8601}'),
('Entered 50 additional phenotypes', '50phen', '#{time.iso8601}', '#{time.iso8601}'),
('Entered 100 additional phenotypes', '100phen', '#{time.iso8601}', '#{time.iso8601}'),
('Created a new phenotype', '1addphen', '#{time.iso8601}', '#{time.iso8601}'),
('Created 5 new phenotypes', '5addphen', '#{time.iso8601}', '#{time.iso8601}'),
('Created 10 new phenotypes', '10addphen', '#{time.iso8601}', '#{time.iso8601}')
SQL
end

View File

@@ -0,0 +1,197 @@
require 'spec_helper'
describe 'genotype parsing', sidekiq: :inline do
# Runs the import once before every example. `genotype` is the `let`
# defined by each context, so referencing it here creates the record.
before do
# When running the background jobs inline, Paperclip hasn't saved the file,
# yet. So we mock the after create hook and run the job manually.
allow_any_instance_of(Genotype).to receive(:parse_genotype)
Preparsing.new.perform(genotype.id)
end
# Runs after every example: first checks that the import left exactly one
# genotype with five UserSnps and five Snps, then destroys the genotype and
# checks that all three tables are empty again.
after do
expect(Genotype.count).to be(1)
expect(UserSnp.count).to be(5)
expect(Snp.count).to be(5)
genotype.destroy
expect(Genotype.count).to be_zero
expect(UserSnp.count).to be_zero
expect(Snp.count).to be_zero
end
# Imports the 23andMe fixture and checks the Snps and UserSnps it creates.
context '23andMe' do
  let(:file) { File.open(Rails.root.join('test/data/23andMe_test.csv')) }
  let(:genotype) { create(:genotype, genotype: file, filetype: '23andme') }

  it 'parses 23andMe data', truncate: true do
    # Every imported Snp starts with an empty genotype frequency, zeroed
    # allele frequencies and ranking 0.
    fresh_snp_tail = [{}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0]
    expected_snps = [
      ['rs11240777', '788822', '1'],
      ['rs12124819', '766409', '1'],
      ['rs3094315', '742429', '1'],
      ['rs3131972', '742584', '1'],
      ['rs4477212', '72017', '1']
    ].map { |head| head + fresh_snp_tail }

    actual_snps = Snp.all.map do |snp|
      [snp.name, snp.position, snp.chromosome, snp.genotype_frequency,
       snp.allele_frequency, snp.ranking]
    end.sort_by(&:first)
    expect(actual_snps).to match_array(expected_snps)

    # UserSnp
    imported = UserSnp.all
    expect(imported.map(&:local_genotype)).to eq(%w(AA AA GG AG AG))
    imported.each do |user_snp|
      expect(user_snp.genotype_id).to eq(genotype.id)
      expect(Snp.pluck(:name)).to include(user_snp.snp_name)
    end
  end
end
# Imports the deCODEme fixture and checks the Snps and UserSnps it creates.
context 'deCODEme' do
  let(:file) { File.open(Rails.root.join('test/data/deCODEme_test.csv')) }
  let(:genotype) { create(:genotype, genotype: file, filetype: 'decodeme') }

  it 'parse deCODEme data', truncate: true do
    # Every imported Snp starts with an empty genotype frequency, zeroed
    # allele frequencies, ranking 0 and a user_snps_count of 1.
    fresh_snp_tail = [{}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
    expected_snps = [
      ['rs11240767', '718814', '1'],
      ['rs2185539', '556738', '1'],
      ['rs3094315', '742429', '1'],
      ['rs4477212', '72017', '1'],
      ['rs6681105', '581938', '1']
    ].map { |head| head + fresh_snp_tail }

    actual_snps = Snp.all.map do |snp|
      [snp.name, snp.position, snp.chromosome, snp.genotype_frequency,
       snp.allele_frequency, snp.ranking, snp.user_snps_count]
    end.sort_by(&:first)
    expect(actual_snps).to match_array(expected_snps)

    # UserSnp
    imported = UserSnp.all
    expect(imported.map(&:local_genotype)).to eq(%w(AA CC TT CC TT))
    imported.each do |user_snp|
      expect(user_snp.genotype_id).to eq(genotype.id)
      expect(Snp.pluck(:name)).to include(user_snp.snp_name)
    end
  end
end
# Imports the Ancestry fixture and checks the Snps and UserSnps it creates.
context 'ancestry' do
  let(:file) { File.open(Rails.root.join('test/data/ancestry_test.csv')) }
  let(:genotype) { create(:genotype, genotype: file, filetype: 'ancestry') }

  it 'parse ancestry data', truncate: true do
    # Every imported Snp starts with an empty genotype frequency, zeroed
    # allele frequencies, ranking 0 and a user_snps_count of 1.
    fresh_snp_tail = [{}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
    expected_snps = [
      ['rs4477212', '82154', '1'],
      ['rs3131972', '752721', '1'],
      ['rs12562034', '768448', '1'],
      ['rs11240777', '798959', '1'],
      ['rs6681049', '800007', '1']
    ].map { |head| head + fresh_snp_tail }

    actual_snps = Snp.all.map do |snp|
      [snp.name, snp.position, snp.chromosome, snp.genotype_frequency,
       snp.allele_frequency, snp.ranking, snp.user_snps_count]
    end.sort_by(&:first)
    expect(actual_snps).to match_array(expected_snps)

    # UserSnp
    imported = UserSnp.all
    expect(imported.map(&:local_genotype)).to eq(%w(CC CC CC CC CC))
    imported.each do |user_snp|
      expect(user_snp.genotype_id).to eq(genotype.id)
      expect(Snp.pluck(:name)).to include(user_snp.snp_name)
    end
  end
end
# Imports the FamilyTreeDNA (Illumina) fixture and checks the Snps and
# UserSnps it creates.
context 'ftdna-illumina' do
  let(:file) { File.open(Rails.root.join('test/data/ftdna-illumina_sample.csv')) }
  let(:genotype) do
    create(:genotype, genotype: file, filetype: 'ftdna-illumina')
  end

  # The description used to read 'parse ancestry data' (copied from the
  # ancestry context), which mislabelled failures in the test output.
  it 'parse ftdna-illumina data', truncate: true do
    # Snp
    snp_data = Snp.all.map do |s|
      [s.name, s.position, s.chromosome, s.genotype_frequency,
       s.allele_frequency, s.ranking, s.user_snps_count]
    end.sort_by { |s| s[0] }
    expected = [
      ['rs3094315', '752566', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs3131972', '752721', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs12562034', '768448', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs12124819', '776546', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs11240777', '798959', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
    ]
    expect(snp_data).to match_array(expected)

    # UserSnp
    user_snps = UserSnp.all
    user_snp_genotypes = user_snps.map(&:local_genotype)
    expected_genotypes = %w(AA GG GG AA AG)
    expect(user_snp_genotypes).to eq(expected_genotypes)
    user_snps.each do |s|
      expect(s.genotype_id).to eq(genotype.id)
      expect(Snp.pluck(:name)).to include(s.snp_name)
    end
  end
end
# Imports the "Inside Your Genome" fixture and checks the Snps and UserSnps
# it creates, including MT names being mapped to dbSNP names.
context 'IYG' do
  let(:file) { File.open(Rails.root.join('test/data/iyg_sample.csv')) }
  let(:genotype) do
    create(:genotype, genotype: file, filetype: 'IYG')
  end

  # The description used to read 'parse ancestry data' (copied from the
  # ancestry context), which mislabelled failures in the test output.
  it 'parse IYG data', truncate: true do
    # Snp
    snp_data = Snp.all.map do |s|
      [s.name, s.position, s.chromosome, s.genotype_frequency,
       s.allele_frequency, s.ranking, s.user_snps_count]
    end.sort_by { |s| s[0] }
    expected = [
      ['rs2131925', '1', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs2815752', '1', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs10924081', '1', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs199838004', '3027', 'MT', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
      ['rs41456348', '4336', 'MT', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
    ]
    expect(snp_data).to match_array(expected)

    # UserSnp
    user_snps = UserSnp.all
    user_snp_genotypes = user_snps.map(&:local_genotype)
    expected_genotypes = %w(GT AA AA T T)
    expect(user_snp_genotypes).to eq(expected_genotypes)
    user_snps.each do |s|
      expect(s.genotype_id).to eq(genotype.id)
      expect(Snp.pluck(:name)).to include(s.snp_name)
    end
  end
end
end

View File

@@ -1,100 +0,0 @@
require 'spec_helper'
# Exercises the Parsing worker against small 23andMe and deCODEme fixture
# files and checks the Snp / UserSnp rows it leaves behind, plus the
# DeleteGenotype worker for each format.
describe 'genotype parsing' do
  # Scratch copy of the fixture that is handed to Parsing#perform.
  let(:temp_file) { Rails.root.join('tmp/snp_file.txt') }

  before do
    # Stub out the Preparsing enqueue. `kind_of(Integer)` is used instead of
    # `an_instance_of(Fixnum)`: Fixnum is deprecated since Ruby 2.4 and gone
    # in 3.2, while kind_of(Integer) matches ids on old and new Rubies alike.
    allow(Sidekiq::Client).to receive(:enqueue).with(Preparsing, kind_of(Integer))
    # Start every example without a leftover scratch file.
    FileUtils.rm(temp_file) if File.exist?(temp_file)
  end

  context '23andMe' do
    let(:file) { Rails.root.join('test/data/23andMe_test.csv') }
    let(:genotype) do
      create(:genotype, genotype_file_name: file.basename, filetype: '23andme')
    end

    it 'parse 23andMe data', truncate: true do
      FileUtils.cp(file, temp_file)
      Parsing.new.perform(genotype.id, temp_file)

      # Snp
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking]
      end
      # Sort by name so the comparison does not depend on row order.
      snp_data = snp_data.sort_by { |s| s[0] }
      expected = [
        ['rs11240777', '788822', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs12124819', '766409', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs3094315', '742429', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs3131972', '742584', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0],
        ['rs4477212', '72017', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0]
      ]
      expect(snp_data).to eq(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(AA AA GG AG AG)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(Snp.pluck(:name)).to include(s.snp_name)
      end
    end

    # These deletion examples could live in their own file; they are kept
    # here because the genotype is already set up at this point, so no extra
    # work is needed to pull it from the test DB.
    # NOTE(review): nothing is parsed inside this example, so Snp.count may
    # already be 0 before the deletion runs - confirm it tests what is meant.
    it 'delete data' do
      DeleteGenotype.new.perform(genotype)
      expect(Snp.count).to eq(0)
    end
  end

  context 'deCODEme' do
    let(:file) { Rails.root.join('test/data/deCODEme_test.csv') }
    let(:genotype) do
      create(:genotype, genotype_file_name: file.basename, filetype: 'decodeme')
    end

    it 'parse deCODEme data', truncate: true do
      FileUtils.cp file, temp_file
      Parsing.new.perform(genotype.id, temp_file)

      # Snp (this format's expectation also covers user_snps_count)
      snp_data = Snp.all.map do |s|
        [s.name, s.position, s.chromosome, s.genotype_frequency,
         s.allele_frequency, s.ranking, s.user_snps_count]
      end.sort_by { |s| s[0] }
      expected = [
        ['rs11240767', '718814', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs2185539', '556738', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs3094315', '742429', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs4477212', '72017', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1],
        ['rs6681105', '581938', '1', {}, { 'A' => 0, 'T' => 0, 'G' => 0, 'C' => 0 }, 0, 1]
      ]
      expect(snp_data).to eq(expected)

      # UserSnp
      user_snps = UserSnp.all
      user_snp_genotypes = user_snps.map(&:local_genotype)
      expected_genotypes = %w(AA CC TT CC TT)
      expect(user_snp_genotypes).to eq(expected_genotypes)
      user_snps.each do |s|
        expect(s.genotype_id).to eq(genotype.id)
        expect(Snp.pluck(:name)).to include(s.snp_name)
      end
    end

    it 'delete deCODEme data' do
      DeleteGenotype.new.perform(genotype)
      expect(Snp.count).to eq(0)
    end
  end
end

View File

@@ -0,0 +1,17 @@
require 'spec_helper'
# Mailer spec: the "finished parsing" mail is built from a genotype id plus a
# stats hash and must mention the user's name and the file type.
describe UserMailer do
  let(:user) { double(:user, name: 'Lord Schmorgoroth', email: 'ls@example.com') }
  let(:genotype) { double('genotype', id: 1, user: user, filetype: '23andme') }
  let(:stats) { { rows_without_comments: 2, rows_after_parsing: 1 } }

  describe '#finished_parsing' do
    it 'notifies the user about his genotype having been parsed' do
      # The mailer is handed only the id, so the lookup is mocked to return
      # the genotype double.
      expect(Genotype).to receive(:find).with(genotype.id).and_return(genotype)

      described_class.finished_parsing(genotype.id, stats).deliver

      delivered_body = ActionMailer::Base.deliveries.last.body.raw_source
      # filetype is stored as '23andme'; the mail text is expected to show
      # the spelling '23andMe'.
      expect(delivered_body).to include(user.name, '23andMe')
    end
  end
end

View File

@@ -7,8 +7,6 @@ require 'factory_girl_rails'
require 'sunspot_test/rspec'
require 'pry-rails' unless ENV['CI']
Sidekiq::Testing.inline!
# Requires supporting ruby files with custom matchers and macros, etc,
# in spec/support/ and its subdirectories.
Dir[Rails.root.join("spec/support/**/*.rb")].each { |f| require f }
@@ -64,4 +62,18 @@ RSpec.configure do |config|
# Clean the database after every example.
config.after(:example) { DatabaseCleaner.clean }

# Before every example: drop any queued Sidekiq jobs, then choose the Sidekiq
# test mode. An explicit `sidekiq: :fake` / `sidekiq: :inline` tag wins;
# otherwise acceptance examples run jobs inline and everything else uses the
# fake queue.
config.before(:each) do |example|
  Sidekiq::Worker.clear_all

  meta = example.metadata
  run_inline =
    case meta[:sidekiq]
    when :fake then false
    when :inline then true
    else meta[:type] == :acceptance
    end

  run_inline ? Sidekiq::Testing.inline! : Sidekiq::Testing.fake!
end
end

View File

@@ -0,0 +1,18 @@
require 'spec_helper'
# Unit spec for Parsing#notify_user: the worker must hand the genotype id and
# the collected stats to UserMailer.finished_parsing via `delay`.
describe Parsing do
  describe '#notify_user' do
    let(:mail) { double('mail') }
    let(:genotype) { double('genotype', id: 1) }
    let(:stats) { { foos: 7 } }

    # notify_user reads its inputs from instance state, so seed that state
    # directly instead of running a full parse.
    before do
      { :@stats => stats, :@genotype => genotype }.each do |ivar, value|
        subject.instance_variable_set(ivar, value)
      end
    end

    it 'sends an email to the user' do
      # `delay` returns the mailer itself so the follow-up call can be
      # asserted on the same object.
      expect(UserMailer).to receive(:delay).and_return(UserMailer)
      expect(UserMailer).to receive(:finished_parsing).with(genotype.id, stats)

      subject.notify_user
    end
  end
end

View File

@@ -0,0 +1,22 @@
#AncestryDNA raw data download
#This file was generated by AncestryDNA at: some_time
#Data was collected using AncestryDNA array version: V1.0
#Data is formatted using AncestryDNA converter version: V1.0
#Below is a text version of your DNA file from Ancestry.com DNA, LLC. THIS
#INFORMATION IS FOR YOUR PERSONAL USE AND IS INTENDED FOR GENEALOGICAL RESEARCH
#ONLY. IT IS NOT INTENDED FOR MEDICAL OR HEALTH PURPOSES. THE EXPORTED DATA IS
#SUBJECT TO THE AncestryDNA TERMS AND CONDITIONS, BUT PLEASE BE AWARE THAT THE
#DOWNLOADED DATA WILL NO LONGER BE PROTECTED BY OUR SECURITY MEASURES.
#
#Genetic data is provided below as five TAB delimited columns. Each line
#corresponds to a SNP. Column one provides the SNP identifier (rsID where
#possible). Columns two and three contain the chromosome and basepair position
#of the SNP using human reference build 37.1 coordinates. Columns four and five
#contain the two alleles observed at this SNP (genotype). The genotype is reported
#on the forward (+) strand with respect to the human reference.
rsid chromosome position allele1 allele2
rs4477212 1 82154 C C
rs3131972 1 752721 C C
rs12562034 1 768448 C C
rs11240777 1 798959 C C
rs6681049 1 800007 C C
Can't render this file because it has a wrong number of fields in line 17.

View File

@@ -1,3 +1,4 @@
Name,Variation,Chromosome,Position,Strand,YourCode
rs4477212,A/G,1,72017,+,AA
rs2185539,C/T,1,556738,+,CC
rs6681105,C/T,1,581938,+,TT
1 rs4477212 Name A/G Variation 1 Chromosome 72017 Position + Strand AA YourCode
1 Name Variation Chromosome Position Strand YourCode
2 rs4477212 rs4477212 A/G A/G 1 1 72017 72017 + + AA AA
3 rs2185539 rs2185539 C/T C/T 1 1 556738 556738 + + CC CC
4 rs6681105 rs6681105 C/T C/T 1 1 581938 581938 + + TT TT

View File

@@ -0,0 +1,6 @@
RSID,CHROMOSOME,POSITION,RESULT
"rs3094315","1","752566","AA"
"rs3131972","1","752721","GG"
"rs12562034","1","768448","GG"
"rs12124819","1","776546","AA"
"rs11240777","1","798959","AG"
1 RSID CHROMOSOME POSITION RESULT
2 rs3094315 1 752566 AA
3 rs3131972 1 752721 GG
4 rs12562034 1 768448 GG
5 rs12124819 1 776546 AA
6 rs11240777 1 798959 AG

5
test/data/iyg_sample.csv Normal file
View File

@@ -0,0 +1,5 @@
rs2131925 GT
rs2815752 AA
rs10924081 AA
MT-T3027C T
MT-T4336C T
1 rs2131925 GT
2 rs2815752 AA
3 rs10924081 AA
4 MT-T3027C T
5 MT-T4336C T

View File

@@ -14,7 +14,7 @@ unless $factories_already_read
# Genotype factory: gives the record a placeholder file name and a user.
# NOTE(review): `association :user` and the bare `user` line both declare the
# user association; this looks like the before/after pair of a one-line change
# - confirm that only one of them is meant to remain.
factory :genotype do
genotype_file_name "foo.txt"
association :user
user
end
factory :snp do