mirror of
https://github.com/chenasraf/snpr.git
synced 2026-05-17 17:38:07 +00:00
Refactor Zipfulldata worker (#541)
* Breaks up the Zipfulldata worker into service classes * Fixes N+1 queries for the phenotype and picture phenotype CSVs * Moves phenotype CSV generation into the database for performance * Fixes unintentional deletion of unrelated files * Reduces the time it takes to assemble the zip file from about 10 hours to about 5 hours, with the bottleneck being the zipping of the genotype files
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class Achievement < ActiveRecord::Base
|
||||
|
||||
class Achievement < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
has_many :user_achievements
|
||||
pg_search_common_scope against: :award
|
||||
|
||||
16
app/models/application_record.rb
Normal file
16
app/models/application_record.rb
Normal file
@@ -0,0 +1,16 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Abstract base class that the application's models inherit from.
class ApplicationRecord < ActiveRecord::Base
  self.abstract_class = true

  class << self
    # Streams the result of a query as CSV text, produced by the database
    # itself through a `COPY ... TO STDOUT` statement.
    #
    # The output uses `;` as delimiter and starts with a header row. Nothing
    # is sent to the database until the returned Enumerator is iterated.
    #
    # NOTE: +sql+ is interpolated into the COPY statement verbatim, so it must
    # never contain untrusted input.
    #
    # @param sql [String] the SELECT statement whose result is exported
    # @return [Enumerator] yields one chunk of CSV text per result row
    def copy_csv(sql)
      Enumerator.new do |yielder|
        raw = ActiveRecord::Base.connection.raw_connection
        statement = "COPY (#{sql}) TO STDOUT WITH CSV HEADER DELIMITER ';'"

        raw.copy_data(statement) do
          # get_copy_data returns a falsy value once the copy is finished.
          while (line = raw.get_copy_data)
            yielder << line
          end
        end
      end
    end
  end
end
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class GenomeGovPaper < ActiveRecord::Base
|
||||
|
||||
class GenomeGovPaper < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :snp_references, as: :paper
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'fileutils'
|
||||
|
||||
class Genotype < ActiveRecord::Base
|
||||
class Genotype < ApplicationRecord
|
||||
belongs_to :user
|
||||
has_many :user_snps, dependent: :delete_all
|
||||
validates_presence_of :user
|
||||
@@ -20,7 +21,7 @@ class Genotype < ActiveRecord::Base
|
||||
end
|
||||
|
||||
def fs_filename
|
||||
"#{user.id}.#{filetype}.#{id}"
|
||||
"#{user_id}.#{filetype}.#{id}"
|
||||
end
|
||||
|
||||
Paperclip.interpolates :fs_filename do |attachment, style|
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class Homepage < ActiveRecord::Base
|
||||
|
||||
class Homepage < ApplicationRecord
|
||||
belongs_to :user
|
||||
after_save :destroy_if_blank
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class MendeleyPaper < ActiveRecord::Base
|
||||
|
||||
class MendeleyPaper < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :snp_references, as: :paper
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class Message < ActiveRecord::Base
|
||||
|
||||
class Message < ApplicationRecord
|
||||
attr_encrypted :body, key: ENV.fetch('USER_DATA_SECRET_KEY')
|
||||
attr_encrypted :subject, key: ENV.fetch('USER_DATA_SECRET_KEY')
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
class OpenHumansProfile < ActiveRecord::Base
|
||||
class OpenHumansProfile < ApplicationRecord
|
||||
belongs_to :user
|
||||
end
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class PgpAnnotation < ActiveRecord::Base
|
||||
|
||||
class PgpAnnotation < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
belongs_to :snp
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class Phenotype < ActiveRecord::Base
|
||||
|
||||
class Phenotype < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :user_phenotypes, dependent: :destroy
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class PhenotypeComment < ActiveRecord::Base
|
||||
|
||||
class PhenotypeComment < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
belongs_to :phenotype
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class PhenotypeSet < ActiveRecord::Base
|
||||
|
||||
class PhenotypeSet < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_and_belongs_to_many :phenotypes
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class PicturePhenotype < ActiveRecord::Base
|
||||
|
||||
class PicturePhenotype < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :user_picture_phenotypes, dependent: :destroy
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class PicturePhenotypeComment < ActiveRecord::Base
|
||||
|
||||
class PicturePhenotypeComment < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
belongs_to :picture_phenotype
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class PlosPaper < ActiveRecord::Base
|
||||
|
||||
class PlosPaper < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :snp_references, as: :paper
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# frozen_string_literal: true
|
||||
class SearchResult < ActiveRecord::Base
|
||||
|
||||
class SearchResult < ApplicationRecord
|
||||
end
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class Snp < ActiveRecord::Base
|
||||
|
||||
class Snp < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :user_snps, foreign_key: :snp_name, primary_key: :name
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class SnpComment < ActiveRecord::Base
|
||||
|
||||
class SnpComment < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
belongs_to :snp
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class SnpReference < ActiveRecord::Base
|
||||
|
||||
class SnpReference < ApplicationRecord
|
||||
self.primary_keys = :snp_id, :paper_id, :paper_type
|
||||
belongs_to :snp
|
||||
belongs_to :paper, polymorphic: true
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class SnpediaPaper < ActiveRecord::Base
|
||||
|
||||
class SnpediaPaper < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_many :snp_references, as: :paper
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class User < ActiveRecord::Base
|
||||
|
||||
class User < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
has_attached_file :avatar,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class UserAchievement < ActiveRecord::Base
|
||||
|
||||
class UserAchievement < ApplicationRecord
|
||||
belongs_to :achievement
|
||||
belongs_to :user
|
||||
end
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class UserPhenotype < ActiveRecord::Base
|
||||
|
||||
class UserPhenotype < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
belongs_to :phenotype
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class UserPicturePhenotype < ActiveRecord::Base
|
||||
|
||||
class UserPicturePhenotype < ApplicationRecord
|
||||
include PgSearchCommon
|
||||
|
||||
belongs_to :picture_phenotype
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
class UserSession < Authlogic::Session::Base
|
||||
after_persisting :raven_set_user_context
|
||||
after_destroy :raven_clear_user_context
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
# frozen_string_literal: true
|
||||
class UserSnp < ActiveRecord::Base
|
||||
|
||||
class UserSnp < ApplicationRecord
|
||||
self.primary_keys = [:genotype_id, :snp_name]
|
||||
belongs_to :snp, foreign_key: :snp_name, primary_key: :name, counter_cache: true
|
||||
has_one :user, through: :genotype
|
||||
|
||||
114
app/services/data_zipper_service.rb
Normal file
114
app/services/data_zipper_service.rb
Normal file
@@ -0,0 +1,114 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'zip'
|
||||
require_relative 'data_zipper_service/generate_user_phenotype_csv'
|
||||
require_relative 'data_zipper_service/zip_user_picture_phenotypes'
|
||||
|
||||
# Builds the complete data dump: a zip archive with the phenotype CSV, the
# picture phenotype CSV plus pictures, a readme and all genotype files.
#
# The archive is assembled under Rails' tmp directory, copied into
# +output_dir+, and a `opensnp_datadump.current.zip` symlink is pointed at it.
# Older dumps in +output_dir+ are deleted afterwards.
class DataZipperService
  # Options shared by the CSV files that are generated in Ruby.
  CSV_OPTIONS = { col_sep: ';' }.freeze
  # Path under which the current dump is linked from the views.
  PUBLIC_PATH = '/data/zip/opensnp_datadump.current.zip'
  DEFAULT_OUTPUT_DIR = Rails.root.join('public', 'data', 'zip').freeze

  attr_reader :time, :time_str, :zip_public_path, :zip_tmp_path, :tmp_dir,
              :link_path, :output_dir, :logger

  # @param output_dir [Pathname] directory that receives the finished archive
  #   and the "current" symlink
  # @param logger [Logger] receives progress and error messages
  def initialize(output_dir: DEFAULT_OUTPUT_DIR, logger: Logger.new(STDOUT))
    @output_dir = output_dir
    @time = Time.now.utc
    # Minute resolution: two runs within the same minute share all paths.
    @time_str = time.strftime('%Y%m%d%H%M')
    @tmp_dir = Rails.root.join('tmp', "opensnp_datadump.#{time_str}")
    zip_file_name = "opensnp_datadump.#{time_str}.zip"
    @zip_public_path = @output_dir.join(zip_file_name)
    @zip_tmp_path = Rails.root.join('tmp', zip_file_name)
    @link_path = @output_dir.join('opensnp_datadump.current.zip')
    @logger = logger
  end

  # Assembles the archive, publishes it and removes older dumps.
  #
  # @return [false] when a dump for the current minute is already in progress;
  #   otherwise the (truthy) result of the final cleanup step
  def call
    # only create a new file if in the current minute none has been created yet
    if Dir.exist?(tmp_dir)
      logger.error("Directory #{tmp_dir} already exists. Exiting...")
      return false
    end

    begin
      logger.info("Creating temp dir: #{tmp_dir}")
      Dir.mkdir(tmp_dir)
      logger.info("Creating zipfile: #{zip_tmp_path}")
      Zip::File.open(zip_tmp_path, Zip::File::CREATE) do |zipfile|
        zip_user_phenotypes(zipfile)
        zip_user_picture_phenotypes(zipfile)
        zip_readme(zipfile)
        zip_genotype_files(zipfile)
      end

      # move from local storage to network storage
      logger.info("Copying #{zip_tmp_path} to #{zip_public_path}")
      FileUtils.cp(zip_tmp_path, zip_public_path)
      logger.info("Deleting #{zip_tmp_path}")
      FileUtils.rm(zip_tmp_path)
      logger.info("Creating symlink #{link_path} to #{zip_public_path}")
      FileUtils.ln_sf(zip_public_path, link_path)

      # everything went OK, now delete old zips
      delete_old_zips
    ensure
      logger.info("Deleting #{tmp_dir}")
      FileUtils.rm_rf(tmp_dir)
      # On success the temporary archive is already gone. After a failure it
      # would otherwise stay behind and accumulate on every failed run.
      if File.exist?(zip_tmp_path)
        logger.info("Deleting leftover #{zip_tmp_path}")
        FileUtils.rm_f(zip_tmp_path)
      end
    end
  end

  # @return [String] public URL path of the current data dump
  def self.public_path
    PUBLIC_PATH
  end

  private

  # Create a CSV with a row for each genotype, with user data and phenotypes as
  # columns.
  def zip_user_phenotypes(zipfile)
    logger.info('Zipping user phenotypes')
    zipfile.get_output_stream("phenotypes_#{time_str}.csv") do |f|
      # Rows are streamed so the whole CSV never has to be held in memory.
      GenerateUserPhenotypeCsv.new.call.each do |row|
        f.write(row)
      end
    end
  end

  # make a CSV describing all of them - which filename is for which user's phenotype
  def zip_user_picture_phenotypes(zipfile)
    logger.info('Zipping user picture phenotypes')
    ZipUserPicturePhenotypes.new(zipfile, tmp_dir, time_str).call
  end

  def zip_readme(zipfile)
    logger.info('Zipping readme')
    # make a README containing time of zip - this way, users can compare with page-status
    # and see how old the data is
    zipfile.get_output_stream('readme.txt') do |f|
      f.write(
        I18n.t(
          'zipfulldata.readme',
          time: time.ctime,
          phenotype_count: Phenotype.count,
          genotype_count: Genotype.count,
          picture_count: PicturePhenotype.count
        )
      )
    end
  end

  def zip_genotype_files(zipfile)
    logger.info('Zipping genotype files')
    ZipGenotypeFiles.new(zipfile).call
  end

  # Removes previous dumps from the output directory. Only files matching the
  # dump naming pattern are considered; the "current" symlink matches that
  # pattern too, so it is excluded explicitly along with the new archive.
  def delete_old_zips
    forbidden_files = [link_path, zip_public_path].map(&:to_s)
    Dir[output_dir.join('opensnp_datadump.*.zip')].each do |f|
      next if forbidden_files.include?(f)

      logger.info("Deleting #{f}")
      File.delete(f)
    end
  end
end
|
||||
@@ -0,0 +1,68 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
class DataZipperService
  # Generates the phenotype CSV of the data dump inside the database: one row
  # per genotype, carrying user data plus one column per phenotype
  # characteristic. Missing values are rendered as "-".
  class GenerateUserPhenotypeCsv
    # @return [Enumerator] yields the CSV text row by row (header first), as
    #   produced by ApplicationRecord.copy_csv
    def call
      # The crosstab query cannot be built without at least one characteristic
      # (its column lists would be empty), so fall back to a plain query that
      # only emits the fixed columns.
      ApplicationRecord.copy_csv(characteristics.empty? ? genotypes_only_sql : crosstab_sql)
    end

    private

    # Build a pivot table with characteristics and user genotype IDs as dimensions and
    # variations as values.
    #
    # PostgreSQL docs: https://www.postgresql.org/docs/9.6/tablefunc.html#AEN145056
    def crosstab_sql
      # The category query is embedded in a single-quoted SQL literal, so any
      # single quote in it has to be doubled.
      category_sql = phenotypes.to_sql.gsub("'", "''")

      <<-SQL
        SELECT
          user_id,
          fs_filename AS genotype_filename,
          user_yob AS date_of_birth,
          user_sex AS chrom_sex,
          oh_user_name AS openhumans_name,
          #{characteristics_headers}
        FROM CROSSTAB(
          'SELECT genotypes.id, -- unique key, must be first
                  genotypes.user_id,
                  genotypes.user_id || ''.'' || genotypes.filetype || ''.'' || genotypes.id,
                  users.yearofbirth,
                  users.sex,
                  COALESCE(open_humans_profiles.open_humans_user_id, ''-''),
                  phenotypes.characteristic, -- column headers, must be second to last
                  user_phenotypes.variation -- values, must be last
           FROM genotypes
           JOIN users ON users.id = genotypes.user_id
           LEFT JOIN user_phenotypes ON user_phenotypes.user_id = genotypes.user_id
           LEFT JOIN phenotypes ON phenotypes.id = user_phenotypes.phenotype_id
           LEFT JOIN open_humans_profiles ON open_humans_profiles.user_id = users.id
           ORDER BY genotypes.id, phenotypes.id',
          '#{category_sql}'
        ) AS ct_variations(
          genotype_id integer,
          user_id integer,
          fs_filename text,
          user_yob text,
          user_sex text,
          oh_user_name text,
          #{characteristics_types}
        )
        ORDER BY user_id, genotype_id
      SQL
    end

    # Same fixed columns and ordering as the crosstab query, for the case that
    # no phenotype exists yet.
    def genotypes_only_sql
      <<-SQL
        SELECT
          genotypes.user_id,
          genotypes.user_id || '.' || genotypes.filetype || '.' || genotypes.id AS genotype_filename,
          users.yearofbirth AS date_of_birth,
          users.sex AS chrom_sex,
          COALESCE(open_humans_profiles.open_humans_user_id, '-') AS openhumans_name
        FROM genotypes
        JOIN users ON users.id = genotypes.user_id
        LEFT JOIN open_humans_profiles ON open_humans_profiles.user_id = users.id
        ORDER BY genotypes.user_id, genotypes.id
      SQL
    end

    # Relation used both as the crosstab category query and as the source of
    # the column names; ordering by id keeps the two in sync.
    def phenotypes
      @phenotypes ||= Phenotype.select(:characteristic).order(:id)
    end

    # SELECT list entries for the phenotype columns; NULL becomes "-".
    def characteristics_headers
      characteristics.map do |c|
        header = c.gsub('"', '""')
        "COALESCE(\"#{header}\", '-') AS \"#{header}\""
      end.join(', ')
    end

    # Column definitions for the crosstab result. Characteristics are quoted
    # as identifiers, with embedded double quotes doubled.
    def characteristics_types
      characteristics.map { |c| "\"#{c.gsub('"', '""')}\" text" }.join(', ')
    end

    def characteristics
      @characteristics ||= phenotypes.pluck(:characteristic)
    end
  end
end
|
||||
27
app/services/data_zipper_service/zip_genotype_files.rb
Normal file
27
app/services/data_zipper_service/zip_genotype_files.rb
Normal file
@@ -0,0 +1,27 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
class DataZipperService
  # Adds the uploaded genotype files to the data dump archive. Each entry name
  # carries the user id, the genotype id, the user's year of birth and sex,
  # and the file type.
  class ZipGenotypeFiles
    attr_reader :zipfile

    # @param zipfile [Zip::File] the archive that receives the genotype files
    def initialize(zipfile)
      @zipfile = zipfile
    end

    # Iterates over all genotypes in batches and adds each one whose file
    # exists on disk; genotypes without a file are skipped.
    def call
      Genotype.includes(:user).find_each do |genotype|
        source_path = genotype.genotype.path
        next unless File.exist?(source_path)

        owner = genotype.user
        yob = anonymized(owner.yearofbirth)
        sex = anonymized(owner.sex)
        entry_name = "user#{genotype.user_id}_file#{genotype.id}_yearofbirth_#{yob}_" \
                     "sex_#{sex}.#{genotype.filetype}.txt"

        zipfile.add(entry_name, source_path)
      end
    end

    private

    # Maps the "rather not say" answer to "unknown" for use in file names.
    def anonymized(value)
      return 'unknown' if value == 'rather not say'

      value
    end
  end
end
|
||||
@@ -0,0 +1,71 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require 'csv'
|
||||
|
||||
class DataZipperService
  # Adds the picture phenotype data to the data dump: a CSV that maps each
  # user to the picture file names of their picture phenotypes, and a nested
  # zip archive holding the pictures themselves.
  class ZipUserPicturePhenotypes
    CSV_BASE_HEADER = %w(user_id date_of_birth chrom_sex).freeze

    attr_reader :zipfile, :tmp_dir, :time_str

    # @param zipfile [Zip::File] the data dump archive
    # @param tmp_dir [Pathname] directory in which the nested picture archive
    #   is assembled
    # @param time_str [String] timestamp used in the generated file names
    def initialize(zipfile, tmp_dir, time_str)
      @zipfile = zipfile
      @tmp_dir = tmp_dir
      @time_str = time_str
    end

    # Builds the picture archive and the CSV, then adds both to +zipfile+.
    def call
      picture_phenotypes = PicturePhenotype.order(:id)
      csv_head = CSV_BASE_HEADER + picture_phenotypes.pluck(:characteristic)
      picture_zip_path = tmp_dir.join("opensnp_picturedump.#{time_str}.zip")
      user_picture_phenotypes_csv = nil

      # Block form: the picture archive is closed even if building a row fails.
      Zip::File.open(picture_zip_path, Zip::File::CREATE) do |picture_zip|
        # The options must be passed as keywords; a positional hash would be
        # taken for the output string on Ruby 3.
        user_picture_phenotypes_csv = CSV.generate(**CSV_OPTIONS) do |csv|
          csv << csv_head

          User
            .order(:id)
            .includes(:user_picture_phenotypes)
            .find_each do |user|
              csv << build_user_picture_phenotype_row(user, picture_phenotypes, picture_zip)
            end
        end
      end

      zipfile.get_output_stream("picture_phenotypes_#{time_str}.csv") do |f|
        f.write(user_picture_phenotypes_csv)
      end
      zipfile.add("picture_phenotypes_#{time_str}_all_pics.zip", picture_zip_path.to_s)
    end

    # Builds the CSV row for one user and, as a side effect, adds the user's
    # pictures to +picture_zip+.
    #
    # @param user [User] user with preloaded user_picture_phenotypes
    # @param picture_phenotypes [Enumerable<PicturePhenotype>] defines the
    #   picture columns and their order
    # @param picture_zip [Zip::File] archive that collects the picture files
    # @return [Array] user id, year of birth, sex, then per picture phenotype
    #   either the picture's file name in the archive or "-"
    def build_user_picture_phenotype_row(user, picture_phenotypes, picture_zip)
      user_picture_phenotypes = user
                                .user_picture_phenotypes
                                .index_by(&:picture_phenotype_id)

      picture_columns = picture_phenotypes.map do |picture_phenotype|
        user_picture_phenotype = user_picture_phenotypes[picture_phenotype.id]
        if user_picture_phenotype && user_picture_phenotype.phenotype_picture.present?
          # The extension is derived from the content type, e.g. image/png.
          extension = user_picture_phenotype
                      .phenotype_picture
                      .content_type
                      .split('/')
                      .last
          extension = 'jpg' if extension == 'jpeg'
          file_name = "#{user_picture_phenotype.id}.#{extension}"
          picture_zip.add(file_name, user_picture_phenotype.phenotype_picture.path)
          file_name
        else
          '-'
        end
      end

      [user.id, user.yearofbirth, user.sex] + picture_columns
    end
  end
end
|
||||
@@ -12,7 +12,7 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="genotype__download-container col-md-6 ">
|
||||
<%= link_to Zipfulldata.public_path, title: "Request download", class: "btn btn-default center-block genotype__download-button" do %>
|
||||
<%= link_to DataZipperService.public_path, title: "Request download", class: "btn btn-default center-block genotype__download-button" do %>
|
||||
Download all data
|
||||
<% end %>
|
||||
<p class="text-center genotype__text-download">Includes all genotyping files, a CSV with all phenotypes of those users,</br> and all picture phenotypes. A preprocessed dump of 5,000 datasets </br>from February 2020 exists on <a href="https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/downloads.html">GenomePrep</a></p>
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
<h3>Listing all SNPs</h3>
|
||||
</div>
|
||||
<div class="snps__download-container col-md-6">
|
||||
<%= link_to Zipfulldata.public_path, title: "Request download", class: "btn btn-default center-block snps__download-button" do %>
|
||||
<%= link_to DataZipperService.public_path, title: "Request download", class: "btn btn-default center-block snps__download-button" do %>
|
||||
Download dump
|
||||
<% end %>
|
||||
<p class="text-center snps__text-download">Includes annotation for all SNPs from all sources</p>
|
||||
|
||||
13
app/workers/data_zipper_worker.rb
Normal file
13
app/workers/data_zipper_worker.rb
Normal file
@@ -0,0 +1,13 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Sidekiq worker that builds the full data dump in the background by
# delegating to DataZipperService.
class DataZipperWorker
  include Sidekiq::Worker

  # retry: 0 is used because `retry: false` can't be used here. Note that with
  # retries disabled, Sidekiq will not track or save any error data for this
  # worker's jobs.
  # dead: false keeps failed jobs out of the dead queue; we don't care about
  # them.
  sidekiq_options(queue: :zipfulldata, retry: 0, unique: true, dead: false)

  # Runs the data zipper, passing on the worker's logger.
  def perform
    service = DataZipperService.new(logger: logger)
    service.call
  end
end
|
||||
@@ -1,227 +0,0 @@
|
||||
|
||||
# frozen_string_literal: true
|
||||
require 'csv'
|
||||
require 'zip'
|
||||
|
||||
|
||||
class Zipfulldata
|
||||
include Sidekiq::Worker
|
||||
sidekiq_options queue: :zipfulldata, retry: 0, unique: true, dead: false
|
||||
# can't do retry => false.
|
||||
# Note with retry disabled, Sidekiq will not track or save any error data for the worker's jobs.
|
||||
# dead => false means don't send dead job to the dead queue, we don't care about that
|
||||
|
||||
attr_reader :time, :time_str, :csv_options, :dump_file_name, :zip_public_path,
|
||||
:zip_fs_path, :tmp_dir, :link_path
|
||||
|
||||
def perform
|
||||
logger.info('job started')
|
||||
run
|
||||
logger.info('job done')
|
||||
end
|
||||
|
||||
def initialize
|
||||
@time = Time.now.utc
|
||||
@time_str = time.strftime("%Y%m%d%H%M")
|
||||
@csv_options = { col_sep: ';' }
|
||||
@dump_file_name = "opensnp_datadump.#{time_str}"
|
||||
@zip_public_path = "public/data/zip/#{dump_file_name}.zip"
|
||||
@zip_fs_path = "/tmp/#{dump_file_name}.zip"
|
||||
@tmp_dir = "#{Rails.root}/tmp/#{dump_file_name}"
|
||||
@link_path = Rails.root.join('public/data/zip/opensnp_datadump.current.zip')
|
||||
end
|
||||
|
||||
def run
|
||||
genotypes = Genotype.includes(user: :user_phenotypes)
|
||||
logger.info("Got #{genotypes.length} genotypes")
|
||||
|
||||
# only create a new file if in the current minute none has been created yet
|
||||
if Dir.exists?(tmp_dir)
|
||||
logger.info("Directory #{tmp_dir} already exists. Exiting...")
|
||||
return false
|
||||
end
|
||||
|
||||
begin
|
||||
logger.info("Making tmpdir #{tmp_dir}")
|
||||
Dir.mkdir(tmp_dir)
|
||||
logger.info("Starting zipfile #{zip_fs_path}")
|
||||
Zip::File.open(zip_fs_path, Zip::File::CREATE) do |zipfile|
|
||||
create_user_csv(genotypes, zipfile)
|
||||
list_of_pics = create_picture_phenotype_csv(zipfile)
|
||||
create_picture_zip(list_of_pics, zipfile)
|
||||
create_readme(zipfile)
|
||||
zip_genotype_files(genotypes, zipfile)
|
||||
end
|
||||
# move from local storage to network storage
|
||||
FileUtils.cp(@zip_fs_path, Rails.root.join("public/data/zip/#{dump_file_name}.zip"))
|
||||
FileUtils.rm(@zip_fs_path)
|
||||
logger.info('created zip-file')
|
||||
|
||||
FileUtils.ln_sf(
|
||||
Rails.root.join("public/data/zip/#{dump_file_name}.zip"),
|
||||
link_path)
|
||||
|
||||
# everything went OK, now delete old zips
|
||||
delete_old_zips
|
||||
|
||||
ensure
|
||||
FileUtils.rm_rf(tmp_dir)
|
||||
end
|
||||
true
|
||||
end
|
||||
|
||||
def create_user_csv(genotypes, zipfile)
|
||||
phenotypes = Phenotype.all
|
||||
csv_file_name = "#{tmp_dir}/dump#{time_str}.csv"
|
||||
csv_head = %w(user_id genotype_filename date_of_birth chrom_sex openhumans_name)
|
||||
csv_head.concat(phenotypes.map(&:characteristic))
|
||||
|
||||
CSV.open(csv_file_name, "w", csv_options) do |csv|
|
||||
csv << csv_head
|
||||
|
||||
# create lines in csv-file for each user who has uploaded his data
|
||||
genotypes.each do |genotype|
|
||||
user = genotype.user
|
||||
oh_name = user.open_humans_profile&.open_humans_user_id || '-'
|
||||
row = [user.id, genotype.fs_filename, user.yearofbirth, user.sex, oh_name]
|
||||
|
||||
phenotypes.each do |phenotype|
|
||||
if up = user.user_phenotypes.where(phenotype_id: phenotype.id).first
|
||||
row << up.variation
|
||||
else
|
||||
row << "-"
|
||||
end
|
||||
end
|
||||
csv << row
|
||||
end
|
||||
end
|
||||
logger.info('created user csv')
|
||||
zipfile.add("phenotypes_#{time_str}.csv", csv_file_name)
|
||||
end
|
||||
|
||||
# make a CSV describing all of them - which filename is for which user's phenotype
|
||||
def create_picture_phenotype_csv(zipfile)
|
||||
file_name = "#{tmp_dir}/picture_dump#{time_str}.csv"
|
||||
logger.info("Writing picture-CSV to #{file_name}")
|
||||
|
||||
list_of_pics = [] # need this for the zip-file-later
|
||||
|
||||
picture_phenotypes = PicturePhenotype.all
|
||||
csv_head = %w(user_id date_of_birth chrom_sex)
|
||||
csv_head.concat(picture_phenotypes.map(&:characteristic))
|
||||
|
||||
CSV.open(file_name, "w", csv_options) do |csv|
|
||||
|
||||
csv << csv_head
|
||||
|
||||
# create lines in csv-file for each user who has uploaded his data
|
||||
|
||||
User.includes(:user_picture_phenotypes).order(:id).each do |u|
|
||||
logger.info("Looking at user #{u.id}")
|
||||
row = [u.id, u.yearofbirth, u.sex]
|
||||
picture_phenotypes.each do |pp|
|
||||
|
||||
# copy the picture with name to +user_id+_+pic_phenotype_id+.png
|
||||
# logger.info("Looking for this picture #{pp.id}")
|
||||
picture = pp.user_picture_phenotypes.where(user_id: u.id).first
|
||||
# does this user have this pic?
|
||||
if picture.present? && picture.phenotype_picture.present?
|
||||
picture_path = picture.phenotype_picture.path
|
||||
basename = picture_path.split("/")[-1]
|
||||
filetype = basename.split(".")[-1]
|
||||
logger.info("FOUND file #{picture_path}, basename is #{basename}")
|
||||
|
||||
list_of_pics << picture
|
||||
row << "#{picture.id}.#{filetype}"
|
||||
else
|
||||
row << '-'
|
||||
end
|
||||
end
|
||||
logger.info('Putting a line into CSV')
|
||||
csv << row
|
||||
end
|
||||
end
|
||||
logger.info('created picture handle csv-file')
|
||||
zipfile.add("picture_phenotypes_#{time_str}.csv", file_name)
|
||||
list_of_pics
|
||||
end
|
||||
|
||||
def create_picture_zip(list_of_pics, zipfile)
|
||||
pic_zipname = "/data/zip/opensnp_picturedump."+time_str+".zip"
|
||||
Zip::File.open("#{Rails.root}/public/#{pic_zipname}", Zip::File::CREATE) do |z|
|
||||
list_of_pics.each do |tmp|
|
||||
begin
|
||||
file_name = tmp.phenotype_picture.path
|
||||
basename = file_name.split("/")[-1]
|
||||
filetype = basename.split(".")[-1]
|
||||
logger.info("Adding file to zip named #{tmp.id.to_s + "." + filetype}")
|
||||
z.add(tmp.id.to_s+"."+filetype, file_name)
|
||||
logger.info("Added #{tmp.id.to_s + "." + filetype}")
|
||||
rescue => e
|
||||
logger.info("create_picture_zip: #{e.class}: #{e.message}")
|
||||
end
|
||||
end
|
||||
end
|
||||
zipfile.add("picture_phenotypes_#{time_str}_all_pics.zip",
|
||||
"#{Rails.root}/public/#{pic_zipname}")
|
||||
logger.info('created picture zip file')
|
||||
end
|
||||
|
||||
def create_readme(zipfile)
|
||||
# make a README containing time of zip - this way, users can compare with page-status
|
||||
# and see how old the data is
|
||||
phenotype_count = Phenotype.count
|
||||
genotype_count = Genotype.count
|
||||
picture_count = PicturePhenotype.count
|
||||
File.open("#{tmp_dir}/dump#{time_str}.txt", "w") do |readme|
|
||||
readme.puts(<<-TXT)
|
||||
This archive was generated on #{time.ctime} UTC. It contains #{phenotype_count} phenotypes, #{genotype_count} genotypes and #{picture_count} picture phenotypes.
|
||||
|
||||
Thanks for using openSNP!
|
||||
TXT
|
||||
end
|
||||
zipfile.add("readme.txt", "#{tmp_dir}/dump#{time_str}.txt")
|
||||
end
|
||||
|
||||
def zip_genotype_files(genotypes, zipfile)
|
||||
genotypes.each do |gen_file|
|
||||
yob = gen_file.user.yearofbirth
|
||||
sex = gen_file.user.sex
|
||||
to_zip_file = "#{Rails.root}/public/data/#{gen_file.fs_filename}"
|
||||
|
||||
if yob == "rather not say"
|
||||
yob = "unknown"
|
||||
end
|
||||
if sex == "rather not say"
|
||||
sex = "unknown"
|
||||
end
|
||||
|
||||
zipfile.add("user#{gen_file.user_id}_file#{gen_file.id}_yearofbirth_#{yob}_sex_#{sex}.#{gen_file.filetype}.txt",
|
||||
to_zip_file) unless !File.exist? to_zip_file
|
||||
end
|
||||
end
|
||||
|
||||
def delete_old_zips
|
||||
forbidden_files = [link_path,
|
||||
Rails.root.join('data', 'annotation.zip').to_s,
|
||||
Rails.root.join('public', 'data', 'zip', "#{dump_file_name}.zip").to_s]
|
||||
Dir[Rails.root.join('public/data/zip/*.zip')].each do |f|
|
||||
if (not forbidden_files.include? f) and (File.ftype(f) == "file")
|
||||
File.delete(f)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def self.public_path
|
||||
'/data/zip/opensnp_datadump.current.zip'
|
||||
end
|
||||
|
||||
def self.gb_size
|
||||
file = Rails.root.join('public', self.public_path)
|
||||
if File.file? file
|
||||
"(Size: #{(File.size(file).to_f / (2**30)).round(2)})"
|
||||
else
|
||||
""
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -10,3 +10,8 @@ en:
|
||||
attributes:
|
||||
phenotype:
|
||||
taken: 'has already been entered.'
|
||||
zipfulldata:
|
||||
readme: |
|
||||
This archive was generated on %{time} UTC. It contains %{phenotype_count} phenotypes, %{genotype_count} genotypes and %{picture_count} picture phenotypes.
|
||||
|
||||
Thanks for using openSNP!
|
||||
|
||||
@@ -34,5 +34,5 @@ PROPERTIES = %w{
|
||||
}.freeze
|
||||
|
||||
100.times do
|
||||
Phenotype.create! characteristic: "#{BODY_PARTS.sample} #{PROPERTIES.sample}"
|
||||
Phenotype.find_or_create_by(characteristic: "#{BODY_PARTS.sample} #{PROPERTIES.sample}")
|
||||
end
|
||||
|
||||
3687
db/structure.sql
3687
db/structure.sql
File diff suppressed because it is too large
Load Diff
@@ -2,6 +2,6 @@
|
||||
namespace :dump do
|
||||
desc 'dump all the data'
|
||||
task full: :environment do
|
||||
Zipfulldata.perform_async
|
||||
DataZipperWorker.perform_async
|
||||
end
|
||||
end
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
FactoryBot.define do
|
||||
factory :genotype do
|
||||
genotype_file_name { 'foo.txt' }
|
||||
genotype { File.new(Rails.root.join('spec', 'fixtures', 'files', 'genotype.txt')) }
|
||||
user
|
||||
end
|
||||
end
|
||||
|
||||
7
spec/factories/open_humans_profiles.rb
Normal file
7
spec/factories/open_humans_profiles.rb
Normal file
@@ -0,0 +1,7 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
FactoryBot.define do
|
||||
factory :open_humans_profile do
|
||||
sequence(:open_humans_user_id) { |n| "oh-user-#{n}" }
|
||||
end
|
||||
end
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
FactoryBot.define do
|
||||
factory :user_picture_phenotype do
|
||||
phenotype_picture { File.new(Rails.root.join('spec', 'fixtures', 'files', 'image.png')) }
|
||||
variation { 'pink' }
|
||||
end
|
||||
end
|
||||
|
||||
1
spec/fixtures/files/genotype.txt
vendored
Normal file
1
spec/fixtures/files/genotype.txt
vendored
Normal file
@@ -0,0 +1 @@
|
||||
assorted genotype data
|
||||
BIN
spec/fixtures/files/image.png
vendored
Normal file
BIN
spec/fixtures/files/image.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 194 KiB |
14
spec/models/application_record_spec.rb
Normal file
14
spec/models/application_record_spec.rb
Normal file
@@ -0,0 +1,14 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
RSpec.describe ApplicationRecord do
|
||||
describe '.copy_csv' do
|
||||
it 'returns an enumerator' do
|
||||
expect(described_class.copy_csv('SELECT 1')).to be_a(Enumerator)
|
||||
end
|
||||
|
||||
it 'returns the query result as an Array of CSV rows' do
|
||||
expect(described_class.copy_csv('SELECT 1 AS foo, 2 AS bar').to_a)
|
||||
.to eq(["foo;bar\n", "1;2\n"])
|
||||
end
|
||||
end
|
||||
end
|
||||
@@ -0,0 +1,165 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Specs for the service that renders user phenotypes as semicolon-separated
# CSV: one row per genotype, one column per phenotype characteristic.
RSpec.describe DataZipperService::GenerateUserPhenotypeCsv do
  subject(:service) { described_class.new }

  # There needs to be at least one phenotype in the database for the CROSSTAB
  # query to work.
  let!(:phenotype_1) { create(:phenotype, characteristic: "hitchhiker's thumb") }
  let!(:phenotype_2) { create(:phenotype, characteristic: 'number of eyes') }

  let(:result) { service.call }

  # Rows are joined without a separator: each one is expected to carry its own
  # trailing newline, as ApplicationRecord.copy_csv rows do.
  let(:parsed_result) do
    CSV.parse(result.to_a.join, col_sep: ';', headers: :first_row)
  end

  it 'returns an Enumerator' do
    expect(result).to be_a(Enumerator)
  end

  it 'returns something, that passes as CSV' do
    expect(parsed_result).to be_a(CSV::Table)
  end

  it 'includes a header in the CSV' do
    fixed_columns = %w[
      user_id
      genotype_filename
      date_of_birth
      chrom_sex
      openhumans_name
    ]
    # One additional String column is expected per phenotype.
    phenotype_columns = Array.new(Phenotype.count) { an_instance_of(String) }

    expect(parsed_result.headers).to match(fixed_columns + phenotype_columns)
  end

  it 'includes all phenotype characteristics as columns' do
    expect(parsed_result.headers)
      .to include("hitchhiker's thumb", 'number of eyes')
  end

  context 'for users without genotypes' do
    let!(:user) { create(:user) }
    let!(:user_phenotype) do
      create(:user_phenotype, user: user, phenotype: phenotype_1, variation: 'yes')
    end

    it 'does not include their phenotypes in the CSV' do
      expect(parsed_result['user_id']).not_to include(user.id.to_s)
    end
  end

  context 'for users without any phenotypes entered' do
    let!(:user) { create(:user) }
    let!(:genotype) { create(:genotype, user: user) }

    it 'includes them in the CSV' do
      expect(parsed_result['user_id']).to include(user.id.to_s)
    end
  end

  context 'for users with genotypes' do
    let!(:user_1) { create(:user, sex: 'why not', yearofbirth: 1990) }
    let!(:user_2) { create(:user, sex: 'female', yearofbirth: 1970) }

    let!(:genotype_1) { create(:genotype, user: user_1) }
    let!(:genotype_2) { create(:genotype, user: user_2) }
    let!(:genotype_3) { create(:genotype, user: user_2) }

    let!(:user_phenotype_1) do
      create(
        :user_phenotype,
        phenotype: phenotype_1,
        variation: 'yes',
        user: user_1
      )
    end
    let!(:user_phenotype_2) do
      create(
        :user_phenotype,
        phenotype: phenotype_1,
        variation: 'no',
        user: user_2
      )
    end
    let!(:user_phenotype_3) do
      create(
        :user_phenotype,
        phenotype: phenotype_2,
        variation: '27',
        user: user_1
      )
    end

    it 'returns a row per genotype' do
      rows = parsed_result.to_a

      # Header plus one row for each of the three genotypes.
      expect(rows.size).to eq(4)
      expect(rows.drop(1)).to eq(
        [
          [
            user_1.id.to_s,
            "#{user_1.id}.23andme.#{genotype_1.id}",
            '1990',
            'why not',
            '-',
            'yes',
            '27'
          ],
          [
            user_2.id.to_s,
            "#{user_2.id}.23andme.#{genotype_2.id}",
            '1970',
            'female',
            '-',
            'no',
            '-'
          ],
          [
            user_2.id.to_s,
            "#{user_2.id}.23andme.#{genotype_3.id}",
            '1970',
            'female',
            '-',
            'no',
            '-'
          ]
        ]
      )
    end
  end

  context 'when a phenotype characteristic contains a double quote' do
    let!(:user) { create(:user) }
    let!(:genotype) { create(:genotype, user: user) }
    let!(:phenotype) { create(:phenotype, characteristic: 'prefers " over \'') }
    let!(:user_phenotype) do
      create(:user_phenotype, phenotype: phenotype, variation: 'yes', user: user)
    end

    # Uses the shared parsed_result instead of re-parsing with an extra "\n"
    # separator, which would insert blank lines between the CSV rows.
    it 'does not fail' do
      expect(parsed_result.headers.last).to eq('prefers " over \'')
      expect(parsed_result.to_a.last.last).to eq('yes')
    end
  end

  context 'when a phenotype characteristic clashes with another column name' do
    before do
      create(:phenotype, characteristic: 'user_yob')
    end

    it 'fails' do
      expect { service.call.to_a }.to raise_error(PG::DuplicateColumn)
    end
  end
end
|
||||
33
spec/services/data_zipper_service/zip_genotype_files_spec.rb
Normal file
33
spec/services/data_zipper_service/zip_genotype_files_spec.rb
Normal file
@@ -0,0 +1,33 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Specs for the service that adds every genotype file to the data dump ZIP.
RSpec.describe DataZipperService::ZipGenotypeFiles do
  # Runs the service against a fresh archive, then re-opens the archive so the
  # examples can inspect what was written.
  subject(:zip) do
    zipfile = Zip::File.open(Tempfile.new, Zip::File::CREATE)
    described_class.new(zipfile).call
    zipfile.close
    Zip::File.open(zipfile.name)
  end

  let!(:user_1) { create(:user, yearofbirth: 1970, sex: 'why not') }
  let!(:genotype_1) { create(:genotype, user: user_1) }
  let!(:open_humans_profile) do
    create(:open_humans_profile, user: user_1, open_humans_user_id: 'oh-user')
  end

  let!(:user_2) { create(:user, yearofbirth: 1994, sex: 'no') }
  let!(:genotype_2) { create(:genotype, user: user_2) }

  # Has no genotype, so no archive entry is expected for this user.
  let!(:user_3) { create(:user) }

  it 'zips genotype files' do
    genotype_entries = zip.glob('user*.txt')

    expect(genotype_entries.map(&:name)).to eq(
      [
        "user#{user_1.id}_file#{genotype_1.id}_yearofbirth_1970_sex_why not.23andme.txt",
        "user#{user_2.id}_file#{genotype_2.id}_yearofbirth_1994_sex_no.23andme.txt"
      ]
    )

    expect(zip.read(genotype_entries.first.name))
      .to eq("assorted genotype data\n")
  end
end
|
||||
@@ -0,0 +1,146 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Specs for the service that adds the picture phenotype CSV and the nested
# archive of phenotype pictures to the data dump ZIP.
RSpec.describe DataZipperService::ZipUserPicturePhenotypes do
  # Runs the service against a fresh archive, then re-opens the archive so the
  # examples can inspect what was written.
  subject(:zip) do
    zipfile = Zip::File.open(Tempfile.new, Zip::File::CREATE)
    described_class.new(zipfile, tmp_dir, time_str).call
    zipfile.close
    Zip::File.open(zipfile.name)
  end

  let(:tmp_dir) do
    Rails.root.join('tmp', 'test', 'data_zipper_service', 'zip_user_picture_phenotypes')
  end
  let(:time_str) { '123' }
  # Where the nested pictures archive is extracted to for inspection.
  let(:pictures_zip_path) { tmp_dir.join('picture_phenotypes_all_pics.zip') }

  let!(:user_1) { create(:user, yearofbirth: 1970, sex: 'why not') }
  let!(:genotype_1) { create(:genotype, user: user_1) }
  let!(:open_humans_profile) do
    create(:open_humans_profile, user: user_1, open_humans_user_id: 'oh-user')
  end

  let!(:user_2) { create(:user, yearofbirth: 1994, sex: 'no') }
  let!(:genotype_2) { create(:genotype, user: user_2) }

  let!(:user_3) { create(:user, yearofbirth: 1922, sex: 'male') }

  let!(:picture_phenotype_1) do
    create(:picture_phenotype, characteristic: 'number of eyes')
  end
  let!(:picture_phenotype_2) do
    create(:picture_phenotype, characteristic: 'length of tongue')
  end

  let!(:user_picture_phenotype_1) do
    create(
      :user_picture_phenotype,
      picture_phenotype: picture_phenotype_1,
      user: user_1
    )
  end
  let!(:user_picture_phenotype_2) do
    create(
      :user_picture_phenotype,
      picture_phenotype: picture_phenotype_1,
      user: user_2
    )
  end
  let!(:user_picture_phenotype_3) do
    create(
      :user_picture_phenotype,
      picture_phenotype: picture_phenotype_2,
      user: user_1
    )
  end

  # There needs to be at least one Phenotype for the CROSSTAB query to work.
  let!(:phenotype) { create(:phenotype) }

  before do
    FileUtils.mkdir_p(tmp_dir)
  end

  after do
    FileUtils.rm_rf(tmp_dir)
  end

  # Copies the nested pictures archive out of the dump into tmp_dir.
  def extract_pictures_zip
    zip.extract(
      zip.glob('picture_phenotypes_*_all_pics.zip').last.name,
      pictures_zip_path
    )
  end

  # Maps user picture phenotypes to the sorted entry names expected in the
  # pictures archive.
  def picture_names_for(*user_picture_phenotypes)
    user_picture_phenotypes.map(&:id).sort.map { |id| "#{id}.png" }
  end

  it 'adds a CSV with image data to the zip file' do
    picture_phenotypes_csv = zip.glob('picture_phenotypes_*.csv').first
    expect(CSV.parse(zip.read(picture_phenotypes_csv.name), col_sep: ';')).to eq(
      [
        [
          'user_id',
          'date_of_birth',
          'chrom_sex',
          'number of eyes',
          'length of tongue'
        ],
        [
          user_1.id.to_s,
          '1970',
          'why not',
          "#{user_picture_phenotype_1.id}.png",
          "#{user_picture_phenotype_3.id}.png"
        ],
        [
          user_2.id.to_s,
          '1994',
          'no',
          "#{user_picture_phenotype_2.id}.png",
          '-'
        ],
        # TODO: Should users without picture phenotypes show up?
        [
          user_3.id.to_s,
          '1922',
          'male',
          '-',
          '-'
        ]
      ]
    )
  end

  it 'creates a ZIP file with phenotype images and adds it to the ZIP file' do
    extract_pictures_zip

    Zip::File.open(pictures_zip_path) do |pictures_zip|
      expect(pictures_zip.glob('*').map(&:name).sort).to eq(
        picture_names_for(
          user_picture_phenotype_1,
          user_picture_phenotype_2,
          user_picture_phenotype_3
        )
      )
    end
  end

  context 'when a user picture phenotype is missing an actual image' do
    before do
      user_picture_phenotype_1.phenotype_picture = nil
      user_picture_phenotype_1.save!
    end

    it 'ignores them' do
      extract_pictures_zip

      Zip::File.open(pictures_zip_path) do |pictures_zip|
        expect(pictures_zip.glob('*').map(&:name).sort).to eq(
          picture_names_for(user_picture_phenotype_2, user_picture_phenotype_3)
        )
      end
    end
  end
end
|
||||
105
spec/services/data_zipper_service_spec.rb
Normal file
105
spec/services/data_zipper_service_spec.rb
Normal file
@@ -0,0 +1,105 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Specs for the top-level service that assembles the full data dump ZIP and
# points the "current" symlink at it.
RSpec.describe DataZipperService do
  subject(:service) { described_class.new(output_dir: output_dir, logger: logger) }

  let(:output_dir) { Rails.root.join('tmp', 'test', 'zipfulldata') }
  let(:symlink) { output_dir.join('opensnp_datadump.current.zip') }
  let(:picture_zip) { Dir[output_dir.join('opensnp_picturedump.*.zip')].last }
  let(:logger) { instance_double(Logger) }

  before do
    FileUtils.mkdir_p(output_dir)
    # Add dummy phenotype so the CROSSTAB queries don't trip.
    create(:phenotype, characteristic: 'affinity for filling out online questionaires')
    allow(logger).to receive(:info)
  end

  after do
    FileUtils.rm_rf(output_dir)
  end

  it 'creates a new dump file and symlink' do
    service.call

    expect(File.symlink?(symlink)).to be(true)

    # Resolve the link target against the symlink's own directory so the
    # check holds for both absolute and relative link targets.
    target = File.expand_path(File.readlink(symlink), output_dir)
    expect(File.exist?(target)).to be(true)
  end

  it 'adds a README' do
    service.call

    Zip::File.open(symlink) do |zip|
      readme = zip.read('readme.txt')
      expect(readme).to eq(<<~README)
        This archive was generated on #{service.time.ctime} UTC. \
        It contains 1 phenotypes, 0 genotypes and 0 picture phenotypes.

        Thanks for using openSNP!
      README
    end
  end

  it 'adds a phenotype csv' do
    service.call

    Zip::File.open(symlink) do |zip|
      expect(zip.glob('phenotypes_*.csv')).to be_present
    end
  end

  it 'adds a picture phenotype zip and csv' do
    service.call

    Zip::File.open(symlink) do |zip|
      expect(zip.glob('picture_phenotypes_*.csv')).to be_present
      expect(zip.glob('picture_phenotypes_*_all_pics.zip')).to be_present
    end
  end

  it 'adds genotype files to the ZIP' do
    create(:genotype)

    service.call

    Zip::File.open(symlink) do |zip|
      expect(zip.glob('user*.txt').count).to eq(1)
    end
  end

  context 'when deleting files' do
    let(:unrelated_file_path) { output_dir.join('do_not_delete_me.zip') }
    let(:old_dump_file_path) { output_dir.join('opensnp_datadump.197001010000.zip') }

    before do
      [unrelated_file_path, old_dump_file_path].each do |path|
        FileUtils.touch(path)
      end
    end

    after do
      [unrelated_file_path, old_dump_file_path].each do |path|
        FileUtils.rm(path) if File.exist?(path)
      end
    end

    it 'deletes old dump files' do
      service.call

      expect(File.exist?(old_dump_file_path)).to be(false)
    end

    it 'does not delete unrelated files' do
      service.call

      expect(File.exist?(unrelated_file_path)).to be(true)
    end
  end

  describe '.public_path' do
    it 'returns the public path of the zip file' do
      expect(described_class.public_path)
        .to eq('/data/zip/opensnp_datadump.current.zip')
    end
  end
end
|
||||
19
spec/workers/data_zipper_worker_spec.rb
Normal file
19
spec/workers/data_zipper_worker_spec.rb
Normal file
@@ -0,0 +1,19 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
# Specs for the worker that delegates the data dump to DataZipperService.
RSpec.describe DataZipperWorker do
  subject(:worker) { described_class.new }

  let(:data_zipper_service) { instance_double(DataZipperService) }

  describe '#perform' do
    before do
      allow(DataZipperService)
        .to receive(:new)
        .with(logger: worker.logger)
        .and_return(data_zipper_service)
      allow(data_zipper_service).to receive(:call)
    end

    it 'calls DataZipperService' do
      worker.perform

      # The service is built once with the worker's logger and run once.
      expect(DataZipperService)
        .to have_received(:new)
        .with(logger: worker.logger)
        .once
      expect(data_zipper_service).to have_received(:call).once
    end
  end
end
|
||||
@@ -1,112 +0,0 @@
|
||||
# frozen_string_literal: true
|
||||
describe Zipfulldata do
|
||||
let(:user) { create(:user) }
|
||||
let(:phenotype) { create(:phenotype, characteristic: "jump height") }
|
||||
let!(:user_phenotype) do
|
||||
create(:user_phenotype, phenotype_id: phenotype.id, variation: '1km', user: user)
|
||||
end
|
||||
let(:genotype) do
|
||||
create(:genotype, user_id: user.id,
|
||||
genotype: File.open("#{Rails.root}/test/data/23andMe_test.csv"))
|
||||
end
|
||||
let(:job) { Zipfulldata.new }
|
||||
let(:csv_options) { { col_sep: ';' } }
|
||||
let(:zipfile) { double('zipfile') }
|
||||
|
||||
before do
|
||||
allow(Sidekiq::Client).to receive(:enqueue).with(Preparsing, instance_of(Integer))
|
||||
tmp_dir = job.instance_variable_get(:@tmp_dir) + '_test_' +
|
||||
Digest::SHA1.hexdigest("#{Time.now.to_i}#{rand}")
|
||||
job.instance_variable_set(:@tmp_dir, tmp_dir)
|
||||
FileUtils.touch job.zip_fs_path.to_s
|
||||
Dir.mkdir(tmp_dir)
|
||||
genotype
|
||||
end
|
||||
|
||||
after do
|
||||
link = Rails.root.join("public/data/zip/opensnp_datadump.current.zip")
|
||||
FileUtils.rm(link) if File.exist?(link)
|
||||
FileUtils.rm(job.zip_fs_path) if File.exist?(job.zip_fs_path)
|
||||
FileUtils.rm(job.zip_public_path) if File.exist?(job.zip_public_path)
|
||||
end
|
||||
|
||||
it "creates user CSVs" do
|
||||
user2 = create(:user)
|
||||
genotype2 = create(:genotype, user_id: user2.id)
|
||||
expect(zipfile).to receive(:add).
|
||||
with("phenotypes_#{job.time_str}.csv",
|
||||
"#{job.tmp_dir}/dump#{job.time_str}.csv")
|
||||
job.create_user_csv([genotype, genotype2], zipfile)
|
||||
csv = CSV.read("#{job.tmp_dir}/dump#{job.time_str}.csv", job.csv_options)
|
||||
exp_header = ['user_id', 'genotype_filename', 'date_of_birth', 'chrom_sex',
|
||||
'openhumans_name', phenotype.characteristic]
|
||||
exp_row1 = [user.id.to_s, genotype.fs_filename, user.yearofbirth, user.sex,
|
||||
'-', user.user_phenotypes.first.variation]
|
||||
exp_row2 = [user2.id.to_s, genotype2.fs_filename, user2.yearofbirth,
|
||||
user2.sex, '-', '-']
|
||||
expect(user.user_phenotypes.first.phenotype).to eq(phenotype)
|
||||
expect(csv).to eq([exp_header, exp_row1, exp_row2])
|
||||
end
|
||||
|
||||
it "creates picture phenotype CSVs" do
|
||||
user2 = create(:user)
|
||||
pp = create(:picture_phenotype)
|
||||
upp = create(:user_picture_phenotype, picture_phenotype: pp,
|
||||
user: user)
|
||||
pic = double('picture')
|
||||
expect(pic).to receive(:path).and_return("#{Rails.root}/foo/bar.png")
|
||||
allow_any_instance_of(UserPicturePhenotype).to receive(:phenotype_picture).
|
||||
and_return(pic)
|
||||
expect(zipfile).to receive(:add).
|
||||
with("picture_phenotypes_#{job.time_str}.csv",
|
||||
"#{job.tmp_dir}/picture_dump#{job.time_str}.csv")
|
||||
job.create_picture_phenotype_csv(zipfile)
|
||||
csv = CSV.read("#{job.tmp_dir}/picture_dump#{job.time_str}.csv", csv_options)
|
||||
expect(csv).to eq(
|
||||
[["user_id", "date_of_birth", "chrom_sex", "Eye color"],
|
||||
[user.id.to_s, user.yearofbirth, user.sex, "#{upp.id}.png"],
|
||||
[user2.id.to_s, user2.yearofbirth, user2.sex, '-']]
|
||||
)
|
||||
end
|
||||
|
||||
it "creates a readme file" do
|
||||
expect(Phenotype).to receive(:count).and_return(42)
|
||||
expect(Genotype).to receive(:count).and_return(23)
|
||||
expect(PicturePhenotype).to receive(:count).and_return(5)
|
||||
expect(zipfile).to receive(:add).
|
||||
with("readme.txt", "#{job.tmp_dir}/dump#{job.time_str}.txt")
|
||||
job.create_readme(zipfile)
|
||||
readme = File.read("#{job.tmp_dir}/dump#{job.time_str}.txt")
|
||||
exp_text = <<-TXT
|
||||
This archive was generated on #{job.time.ctime} UTC. It contains 42 phenotypes, 23 genotypes and 5 picture phenotypes.
|
||||
|
||||
Thanks for using openSNP!
|
||||
TXT
|
||||
end
|
||||
|
||||
it "zips genotype files" do
|
||||
expect(zipfile).to receive(:add).with(
|
||||
"user#{user.id}_file#{genotype.id}_yearofbirth_#{user.yearofbirth}" +
|
||||
"_sex_#{user.sex}.#{genotype.filetype}.txt",
|
||||
"#{Rails.root}/public/data/#{genotype.fs_filename}")
|
||||
job.zip_genotype_files([genotype], zipfile)
|
||||
end
|
||||
|
||||
it "runs the job" do
|
||||
upp = double('user_picture_phenotype')
|
||||
expect(Dir).to receive(:exists?).with(job.tmp_dir).and_return(false)
|
||||
expect(Dir).to receive(:mkdir).with(job.tmp_dir)
|
||||
expect(Zip::File).to receive(:open).with(job.zip_fs_path, Zip::File::CREATE).
|
||||
and_yield(zipfile)
|
||||
expect(job).to receive(:create_user_csv).with([genotype], zipfile)
|
||||
expect(job).to receive(:create_picture_phenotype_csv).with(zipfile).and_return([upp])
|
||||
expect(job).to receive(:create_picture_zip).with([upp], zipfile)
|
||||
expect(job).to receive(:create_readme).with(zipfile)
|
||||
expect(job).to receive(:zip_genotype_files).with([genotype], zipfile)
|
||||
expect(FileUtils).to receive(:ln_sf).with(
|
||||
Rails.root.join("public/data/zip/#{job.dump_file_name}.zip"),
|
||||
Rails.root.join("public/data/zip/opensnp_datadump.current.zip"))
|
||||
expect(FileUtils).to receive(:rm_rf).with(job.tmp_dir)
|
||||
expect(job.run).to be(true)
|
||||
end
|
||||
end
|
||||
Reference in New Issue
Block a user