Refactor Zipfulldata worker (#541)

* Breaks up Zipfulldata worker into service classes
* Fixes N+1 queries for phenotype and picture phenotype CSVs
  * Moves phenotype CSV generation into the database for performance
* Fixes unintentional deletion of unrelated files
* Reduces the time it takes to assemble the zip file from about 10 hours to about 5 hours; the remaining bottleneck is zipping the genotype files
This commit is contained in:
Helge Rausch
2023-01-04 09:58:25 +01:00
committed by GitHub
parent c40e12350c
commit 828d84f1a9
50 changed files with 2703 additions and 2212 deletions

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Achievement < ActiveRecord::Base
class Achievement < ApplicationRecord
include PgSearchCommon
has_many :user_achievements
pg_search_common_scope against: :award

View File

@@ -0,0 +1,16 @@
# frozen_string_literal: true
# Abstract base class the application's models inherit from.
class ApplicationRecord < ActiveRecord::Base
  self.abstract_class = true

  # Runs +sql+ through a "COPY ... TO STDOUT" statement and exposes the
  # output as a lazy stream of CSV lines (header line first, ';' as the
  # delimiter).
  #
  # Nothing is sent to the database until the returned Enumerator is iterated.
  #
  # +sql+ is interpolated into the COPY statement verbatim, so it must come
  # from trusted code, never from user input.
  #
  # @param sql [String] the query whose result should be exported
  # @return [Enumerator] yields each chunk returned by +get_copy_data+
  def self.copy_csv(sql)
    Enumerator.new do |yielder|
      raw_connection = ActiveRecord::Base.connection.raw_connection
      copy_statement = "COPY (#{sql}) TO STDOUT WITH CSV HEADER DELIMITER ';'"

      raw_connection.copy_data(copy_statement) do
        # The loop ends as soon as get_copy_data returns a falsy value.
        while (line = raw_connection.get_copy_data)
          yielder << line
        end
      end
    end
  end
end

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class GenomeGovPaper < ActiveRecord::Base
class GenomeGovPaper < ApplicationRecord
include PgSearchCommon
has_many :snp_references, as: :paper

View File

@@ -1,7 +1,8 @@
# frozen_string_literal: true
require 'fileutils'
class Genotype < ActiveRecord::Base
class Genotype < ApplicationRecord
belongs_to :user
has_many :user_snps, dependent: :delete_all
validates_presence_of :user
@@ -20,7 +21,7 @@ class Genotype < ActiveRecord::Base
end
def fs_filename
"#{user.id}.#{filetype}.#{id}"
"#{user_id}.#{filetype}.#{id}"
end
Paperclip.interpolates :fs_filename do |attachment, style|

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Homepage < ActiveRecord::Base
class Homepage < ApplicationRecord
belongs_to :user
after_save :destroy_if_blank

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class MendeleyPaper < ActiveRecord::Base
class MendeleyPaper < ApplicationRecord
include PgSearchCommon
has_many :snp_references, as: :paper

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Message < ActiveRecord::Base
class Message < ApplicationRecord
attr_encrypted :body, key: ENV.fetch('USER_DATA_SECRET_KEY')
attr_encrypted :subject, key: ENV.fetch('USER_DATA_SECRET_KEY')

View File

@@ -1,5 +1,5 @@
# frozen_string_literal: true
class OpenHumansProfile < ActiveRecord::Base
class OpenHumansProfile < ApplicationRecord
belongs_to :user
end

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PgpAnnotation < ActiveRecord::Base
class PgpAnnotation < ApplicationRecord
include PgSearchCommon
belongs_to :snp

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Phenotype < ActiveRecord::Base
class Phenotype < ApplicationRecord
include PgSearchCommon
has_many :user_phenotypes, dependent: :destroy

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PhenotypeComment < ActiveRecord::Base
class PhenotypeComment < ApplicationRecord
include PgSearchCommon
belongs_to :phenotype

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PhenotypeSet < ActiveRecord::Base
class PhenotypeSet < ApplicationRecord
include PgSearchCommon
has_and_belongs_to_many :phenotypes

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PicturePhenotype < ActiveRecord::Base
class PicturePhenotype < ApplicationRecord
include PgSearchCommon
has_many :user_picture_phenotypes, dependent: :destroy

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PicturePhenotypeComment < ActiveRecord::Base
class PicturePhenotypeComment < ApplicationRecord
include PgSearchCommon
belongs_to :picture_phenotype

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class PlosPaper < ActiveRecord::Base
class PlosPaper < ApplicationRecord
include PgSearchCommon
has_many :snp_references, as: :paper

View File

@@ -1,3 +1,4 @@
# frozen_string_literal: true
class SearchResult < ActiveRecord::Base
class SearchResult < ApplicationRecord
end

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class Snp < ActiveRecord::Base
class Snp < ApplicationRecord
include PgSearchCommon
has_many :user_snps, foreign_key: :snp_name, primary_key: :name

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class SnpComment < ActiveRecord::Base
class SnpComment < ApplicationRecord
include PgSearchCommon
belongs_to :snp

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class SnpReference < ActiveRecord::Base
class SnpReference < ApplicationRecord
self.primary_keys = :snp_id, :paper_id, :paper_type
belongs_to :snp
belongs_to :paper, polymorphic: true

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class SnpediaPaper < ActiveRecord::Base
class SnpediaPaper < ApplicationRecord
include PgSearchCommon
has_many :snp_references, as: :paper

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class User < ActiveRecord::Base
class User < ApplicationRecord
include PgSearchCommon
has_attached_file :avatar,

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserAchievement < ActiveRecord::Base
class UserAchievement < ApplicationRecord
belongs_to :achievement
belongs_to :user
end

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserPhenotype < ActiveRecord::Base
class UserPhenotype < ApplicationRecord
include PgSearchCommon
belongs_to :phenotype

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserPicturePhenotype < ActiveRecord::Base
class UserPicturePhenotype < ApplicationRecord
include PgSearchCommon
belongs_to :picture_phenotype

View File

@@ -1,4 +1,5 @@
# frozen_string_literal: true
class UserSession < Authlogic::Session::Base
after_persisting :raven_set_user_context
after_destroy :raven_clear_user_context

View File

@@ -1,5 +1,6 @@
# frozen_string_literal: true
class UserSnp < ActiveRecord::Base
class UserSnp < ApplicationRecord
self.primary_keys = [:genotype_id, :snp_name]
belongs_to :snp, foreign_key: :snp_name, primary_key: :name, counter_cache: true
has_one :user, through: :genotype

View File

@@ -0,0 +1,114 @@
# frozen_string_literal: true
require 'zip'
require_relative 'data_zipper_service/generate_user_phenotype_csv'
require_relative 'data_zipper_service/zip_user_picture_phenotypes'
# Assembles the full data dump: a zip archive containing the phenotype CSV,
# the picture phenotype CSV plus a nested zip of the pictures, a readme and
# all genotype files.
#
# The archive is built in Rails' tmp directory, copied into +output_dir+, and
# then a "current" symlink is pointed at it. Older dumps in +output_dir+ are
# removed afterwards.
class DataZipperService
  # Options shared by the CSV files of the dump.
  CSV_OPTIONS = { col_sep: ';' }.freeze
  # Path of the "current" symlink as returned by .public_path.
  PUBLIC_PATH = '/data/zip/opensnp_datadump.current.zip'
  DEFAULT_OUTPUT_DIR = Rails.root.join('public', 'data', 'zip').freeze

  attr_reader :time, :time_str, :zip_public_path, :zip_tmp_path, :tmp_dir,
              :link_path, :output_dir, :logger

  # @param output_dir [Pathname] directory the finished zip and the symlink
  #   are placed in (must respond to +join+)
  # @param logger [Logger] receives progress messages
  def initialize(output_dir: DEFAULT_OUTPUT_DIR, logger: Logger.new(STDOUT))
    @output_dir = output_dir
    @time = Time.now.utc
    # Minute resolution: two runs within the same minute share all paths.
    @time_str = time.strftime('%Y%m%d%H%M')
    @tmp_dir = Rails.root.join('tmp', "opensnp_datadump.#{time_str}")
    zip_file_name = "opensnp_datadump.#{time_str}.zip"
    @zip_public_path = @output_dir.join(zip_file_name)
    @zip_tmp_path = Rails.root.join('tmp', zip_file_name)
    @link_path = @output_dir.join('opensnp_datadump.current.zip')
    @logger = logger
  end

  # Builds the dump.
  #
  # @return [false, Object] +false+ when the temp dir of this minute already
  #   exists and nothing was done; a truthy value otherwise
  def call
    # only create a new file if in the current minute none has been created yet
    if Dir.exist?(tmp_dir)
      logger.error("Directory #{tmp_dir} already exists. Exiting...")
      return false
    end

    begin
      logger.info("Creating temp dir: #{tmp_dir}")
      Dir.mkdir(tmp_dir)

      logger.info("Creating zipfile: #{zip_tmp_path}")
      Zip::File.open(zip_tmp_path, Zip::File::CREATE) do |zipfile|
        zip_user_phenotypes(zipfile)
        zip_user_picture_phenotypes(zipfile)
        zip_readme(zipfile)
        zip_genotype_files(zipfile)
      end

      # move from local storage to network storage
      logger.info("Copying #{zip_tmp_path} to #{zip_public_path}")
      FileUtils.cp(zip_tmp_path, zip_public_path)
      logger.info("Deleting #{zip_tmp_path}")
      FileUtils.rm(zip_tmp_path)

      logger.info("Creating symlink #{link_path} to #{zip_public_path}")
      FileUtils.ln_sf(zip_public_path, link_path)

      # everything went OK, now delete old zips
      delete_old_zips
    ensure
      logger.info("Deleting #{tmp_dir}")
      FileUtils.rm_rf(tmp_dir)

      # When zipping or copying failed, the temporary zip is still around.
      # Its name is unique per run, so without this clean-up every failed run
      # would leave another archive behind in Rails' tmp directory.
      if File.exist?(zip_tmp_path)
        logger.info("Deleting leftover #{zip_tmp_path}")
        FileUtils.rm_f(zip_tmp_path)
      end
    end
  end

  # @return [String] path of the "current" dump symlink
  def self.public_path
    PUBLIC_PATH
  end

  private

  # Create a CSV with a row for each genotype, with user data and phenotypes as
  # columns.
  def zip_user_phenotypes(zipfile)
    logger.info('Zipping user phenotypes')

    zipfile.get_output_stream("phenotypes_#{time_str}.csv") do |f|
      # Rows are written one at a time as the generator yields them.
      GenerateUserPhenotypeCsv.new.call.each do |row|
        f.write(row)
      end
    end
  end

  # make a CSV describing all of them - which filename is for which user's phenotype
  def zip_user_picture_phenotypes(zipfile)
    logger.info('Zipping user picture phenotypes')

    ZipUserPicturePhenotypes.new(zipfile, tmp_dir, time_str).call
  end

  def zip_readme(zipfile)
    logger.info('Zipping readme')

    # make a README containing time of zip - this way, users can compare with page-status
    # and see how old the data is
    zipfile.get_output_stream('readme.txt') do |f|
      f.write(
        I18n.t(
          'zipfulldata.readme',
          time: time.ctime,
          phenotype_count: Phenotype.count,
          genotype_count: Genotype.count,
          picture_count: PicturePhenotype.count
        )
      )
    end
  end

  def zip_genotype_files(zipfile)
    logger.info('Zipping genotype files')

    ZipGenotypeFiles.new(zipfile).call
  end

  # Removes every dump in output_dir except the one just created and the
  # "current" symlink. Only files matching the dump naming scheme are
  # considered, so unrelated files in the directory are left alone.
  def delete_old_zips
    forbidden_files = [link_path, zip_public_path].map(&:to_s)

    Dir[output_dir.join('opensnp_datadump.*.zip')].each do |f|
      next if forbidden_files.include?(f)

      logger.info("Deleting #{f}")
      File.delete(f)
    end
  end
end

View File

@@ -0,0 +1,68 @@
# frozen_string_literal: true
class DataZipperService
# Generates the phenotype CSV of the data dump inside the database: one row
# per genotype, carrying the user's attributes and one column per phenotype
# characteristic.
class GenerateUserPhenotypeCsv
# Builds the CROSSTAB query and hands it to ApplicationRecord.copy_csv,
# returning whatever copy_csv returns. Phenotype columns a user has no
# variation for are rendered as '-' (see the COALESCE in
# #characteristics_headers).
#
# NOTE(review): when there are no phenotypes, both interpolated column lists
# are empty strings, which leaves a trailing comma in the generated SQL; the
# query presumably fails in that case - confirm.
def call
# Build a pivot table with characteristics and user genotype IDs as dimensions and
# variations as values.
#
# PostgreSQL docs: https://www.postgresql.org/docs/9.6/tablefunc.html#AEN145056
ApplicationRecord.copy_csv(<<-SQL)
SELECT
user_id,
fs_filename AS genotype_filename,
user_yob AS date_of_birth,
user_sex AS chrom_sex,
oh_user_name AS openhumans_name,
#{characteristics_headers}
FROM CROSSTAB(
'SELECT genotypes.id, -- unique key, must be first
genotypes.user_id,
genotypes.user_id || ''.'' || genotypes.filetype || ''.'' || genotypes.id,
users.yearofbirth,
users.sex,
COALESCE(open_humans_profiles.open_humans_user_id, ''-''),
phenotypes.characteristic, -- column headers, must be second to last
user_phenotypes.variation -- values, must be last
FROM genotypes
JOIN users ON users.id = genotypes.user_id
LEFT JOIN user_phenotypes ON user_phenotypes.user_id = genotypes.user_id
LEFT JOIN phenotypes ON phenotypes.id = user_phenotypes.phenotype_id
LEFT JOIN open_humans_profiles ON open_humans_profiles.user_id = users.id
ORDER BY genotypes.id, phenotypes.id',
'#{phenotypes.to_sql}'
) AS ct_variations(
genotype_id integer,
user_id integer,
fs_filename text,
user_yob text,
user_sex text,
oh_user_name text,
#{characteristics_types}
)
ORDER BY user_id, genotype_id
SQL
end
private
# Relation selecting all phenotype characteristics ordered by id. Its SQL is
# embedded above as the second CROSSTAB argument, inside a single-quoted SQL
# literal - so the generated SQL must not contain single quotes itself.
def phenotypes
@phenotypes ||= Phenotype.select(:characteristic).order(:id)
end
# Select list for the characteristic columns: every column is wrapped in
# COALESCE(..., '-') and aliased to its own name. Double quotes inside a
# characteristic are doubled so it can be used as a quoted identifier.
def characteristics_headers
characteristics.map do |c|
header = c.gsub('"', '""')
"COALESCE(\"#{header}\", '-') AS \"#{header}\""
end.join(', ')
end
# Column definitions (name plus type text) of the characteristic columns for
# the CROSSTAB result, using the same identifier quoting as above.
def characteristics_types
characteristics.map { |c| "\"#{c.gsub('"', '""')}\" text" }.join(', ')
end
# Characteristic names in the order given by #phenotypes; memoized so both
# column lists are built from the same values.
def characteristics
@characteristics ||= phenotypes.pluck(:characteristic)
end
end
end

View File

@@ -0,0 +1,27 @@
# frozen_string_literal: true
class DataZipperService
  # Adds the file of every genotype to the given zip archive.
  class ZipGenotypeFiles
    # @param zipfile [#add] archive the genotype files are added to
    def initialize(zipfile)
      @zipfile = zipfile
    end

    attr_reader :zipfile

    # Walks over all genotypes and adds each existing genotype file to the
    # archive. Genotypes without an attachment path, or whose file is missing
    # on disk, are skipped instead of aborting the whole dump.
    def call
      Genotype.includes(:user).find_each do |genotype|
        path = genotype.genotype.path
        # File.exist?(nil) raises a TypeError, so check for a path first.
        next unless path && File.exist?(path)

        zipfile.add(entry_name(genotype), path)
      end
    end

    private

    # Name of the genotype file inside the archive; it carries the user id,
    # the genotype id, year of birth, sex and the file type.
    def entry_name(genotype)
      user = genotype.user
      yob = disclosed_or_unknown(user.yearofbirth)
      sex = disclosed_or_unknown(user.sex)

      "user#{genotype.user_id}_file#{genotype.id}_yearofbirth_#{yob}_" \
        "sex_#{sex}.#{genotype.filetype}.txt"
    end

    # Maps the "rather not say" answer to "unknown"; any other value is kept.
    def disclosed_or_unknown(value)
      value == 'rather not say' ? 'unknown' : value
    end
  end
end

View File

@@ -0,0 +1,71 @@
# frozen_string_literal: true
require 'csv'
class DataZipperService
  # Adds two entries to the dump archive: a CSV telling which picture file
  # belongs to which user and picture phenotype, and a nested zip containing
  # the pictures themselves.
  class ZipUserPicturePhenotypes
    CSV_BASE_HEADER = %w(user_id date_of_birth chrom_sex).freeze

    # @param zipfile [#get_output_stream, #add] the dump archive
    # @param tmp_dir [Pathname] directory the nested picture zip is built in
    # @param time_str [String] timestamp used in the file names
    def initialize(zipfile, tmp_dir, time_str)
      @zipfile = zipfile
      @tmp_dir = tmp_dir
      @time_str = time_str
    end

    attr_reader :zipfile, :tmp_dir, :time_str

    # Builds the picture zip and the CSV and adds both to the dump archive.
    def call
      picture_phenotypes = PicturePhenotype.order(:id)
      csv_head = CSV_BASE_HEADER + picture_phenotypes.pluck(:characteristic)
      picture_zip_path = tmp_dir.join("opensnp_picturedump.#{time_str}.zip")
      user_picture_phenotypes_csv = nil

      # The block form closes the picture zip when the block is left, even
      # when building a row raises.
      Zip::File.open(picture_zip_path, Zip::File::CREATE) do |picture_zip|
        # The options have to be passed as keywords: CSV.generate takes the
        # target string as its only positional argument.
        user_picture_phenotypes_csv = CSV.generate(**CSV_OPTIONS) do |csv|
          csv << csv_head

          User
            .order(:id)
            .includes(:user_picture_phenotypes)
            .find_each do |user|
              csv << build_user_picture_phenotype_row(user, picture_phenotypes, picture_zip)
            end
        end
      end

      zipfile.get_output_stream("picture_phenotypes_#{time_str}.csv") do |f|
        f.write(user_picture_phenotypes_csv)
      end
      zipfile.add("picture_phenotypes_#{time_str}_all_pics.zip", picture_zip_path)
    end

    # Builds the CSV row of one user and, as a side effect, adds the user's
    # pictures to +picture_zip+.
    #
    # @return [Array] user id, year of birth and sex, followed by one cell per
    #   picture phenotype: the picture's file name inside the picture zip, or
    #   '-' when the user has no usable picture for that phenotype
    def build_user_picture_phenotype_row(user, picture_phenotypes, picture_zip)
      user_picture_phenotypes = user
                                .user_picture_phenotypes
                                .index_by(&:picture_phenotype_id)

      picture_cells = picture_phenotypes.map do |picture_phenotype|
        add_picture(user_picture_phenotypes[picture_phenotype.id], picture_zip) || '-'
      end

      [user.id, user.yearofbirth, user.sex] + picture_cells
    end

    private

    # Adds the picture of +user_picture_phenotype+ to +picture_zip+.
    #
    # @return [String, nil] the file name used inside the zip; nil when there
    #   is no record, no attached picture, or the picture file is missing on
    #   disk (one missing file must not abort the whole dump - genotype files
    #   are treated the same way)
    def add_picture(user_picture_phenotype, picture_zip)
      return unless user_picture_phenotype

      picture = user_picture_phenotype.phenotype_picture
      return unless picture.present?

      path = picture.path
      return unless path && File.exist?(path)

      # The extension is derived from the content type, e.g. "image/png".
      extension = picture.content_type.split('/').last
      extension = 'jpg' if extension == 'jpeg'
      file_name = "#{user_picture_phenotype.id}.#{extension}"

      picture_zip.add(file_name, path)
      file_name
    end
  end
end

View File

@@ -12,7 +12,7 @@
</div>
</div>
<div class="genotype__download-container col-md-6 ">
<%= link_to Zipfulldata.public_path, title: "Request download", class: "btn btn-default center-block genotype__download-button" do %>
<%= link_to DataZipperService.public_path, title: "Request download", class: "btn btn-default center-block genotype__download-button" do %>
Download all data
<% end %>
<p class="text-center genotype__text-download">Includes all genotyping files, a CSV with all phenotypes of those users,</br> and all picture phenotypes. A preprocessed dump of 5,000 datasets </br>from February 2020 exists on <a href="https://supfam.mrc-lmb.cam.ac.uk/GenomePrep/downloads.html">GenomePrep</a></p>

View File

@@ -4,7 +4,7 @@
<h3>Listing all SNPs</h3>
</div>
<div class="snps__download-container col-md-6">
<%= link_to Zipfulldata.public_path, title: "Request download", class: "btn btn-default center-block snps__download-button" do %>
<%= link_to DataZipperService.public_path, title: "Request download", class: "btn btn-default center-block snps__download-button" do %>
Download dump
<% end %>
<p class="text-center snps__text-download">Includes annotation for all SNPs from all sources</p>

View File

@@ -0,0 +1,13 @@
# frozen_string_literal: true
# Sidekiq worker that builds the data dump by delegating to DataZipperService.
class DataZipperWorker
include Sidekiq::Worker
sidekiq_options queue: :zipfulldata, retry: 0, unique: true, dead: false
# `retry: 0` is used because `retry: false` cannot be used here.
# Note: with retry disabled, Sidekiq will not track or save any error data for the worker's jobs.
# `dead: false` means failed jobs are not sent to the dead queue; we don't care about those.

# Runs the service, passing the worker's logger so progress ends up in the worker log.
def perform
DataZipperService.new(logger: logger).call
end
end

View File

@@ -1,227 +0,0 @@
# frozen_string_literal: true
require 'csv'
require 'zip'
class Zipfulldata
include Sidekiq::Worker
sidekiq_options queue: :zipfulldata, retry: 0, unique: true, dead: false
# can't do retry => false.
# Note with retry disabled, Sidekiq will not track or save any error data for the worker's jobs.
# dead => false means don't send dead job to the dead queue, we don't care about that
attr_reader :time, :time_str, :csv_options, :dump_file_name, :zip_public_path,
:zip_fs_path, :tmp_dir, :link_path
def perform
logger.info('job started')
run
logger.info('job done')
end
def initialize
@time = Time.now.utc
@time_str = time.strftime("%Y%m%d%H%M")
@csv_options = { col_sep: ';' }
@dump_file_name = "opensnp_datadump.#{time_str}"
@zip_public_path = "public/data/zip/#{dump_file_name}.zip"
@zip_fs_path = "/tmp/#{dump_file_name}.zip"
@tmp_dir = "#{Rails.root}/tmp/#{dump_file_name}"
@link_path = Rails.root.join('public/data/zip/opensnp_datadump.current.zip')
end
def run
genotypes = Genotype.includes(user: :user_phenotypes)
logger.info("Got #{genotypes.length} genotypes")
# only create a new file if in the current minute none has been created yet
if Dir.exists?(tmp_dir)
logger.info("Directory #{tmp_dir} already exists. Exiting...")
return false
end
begin
logger.info("Making tmpdir #{tmp_dir}")
Dir.mkdir(tmp_dir)
logger.info("Starting zipfile #{zip_fs_path}")
Zip::File.open(zip_fs_path, Zip::File::CREATE) do |zipfile|
create_user_csv(genotypes, zipfile)
list_of_pics = create_picture_phenotype_csv(zipfile)
create_picture_zip(list_of_pics, zipfile)
create_readme(zipfile)
zip_genotype_files(genotypes, zipfile)
end
# move from local storage to network storage
FileUtils.cp(@zip_fs_path, Rails.root.join("public/data/zip/#{dump_file_name}.zip"))
FileUtils.rm(@zip_fs_path)
logger.info('created zip-file')
FileUtils.ln_sf(
Rails.root.join("public/data/zip/#{dump_file_name}.zip"),
link_path)
# everything went OK, now delete old zips
delete_old_zips
ensure
FileUtils.rm_rf(tmp_dir)
end
true
end
def create_user_csv(genotypes, zipfile)
phenotypes = Phenotype.all
csv_file_name = "#{tmp_dir}/dump#{time_str}.csv"
csv_head = %w(user_id genotype_filename date_of_birth chrom_sex openhumans_name)
csv_head.concat(phenotypes.map(&:characteristic))
CSV.open(csv_file_name, "w", csv_options) do |csv|
csv << csv_head
# create lines in csv-file for each user who has uploaded his data
genotypes.each do |genotype|
user = genotype.user
oh_name = user.open_humans_profile&.open_humans_user_id || '-'
row = [user.id, genotype.fs_filename, user.yearofbirth, user.sex, oh_name]
phenotypes.each do |phenotype|
if up = user.user_phenotypes.where(phenotype_id: phenotype.id).first
row << up.variation
else
row << "-"
end
end
csv << row
end
end
logger.info('created user csv')
zipfile.add("phenotypes_#{time_str}.csv", csv_file_name)
end
# make a CSV describing all of them - which filename is for which user's phenotype
def create_picture_phenotype_csv(zipfile)
file_name = "#{tmp_dir}/picture_dump#{time_str}.csv"
logger.info("Writing picture-CSV to #{file_name}")
list_of_pics = [] # need this for the zip-file-later
picture_phenotypes = PicturePhenotype.all
csv_head = %w(user_id date_of_birth chrom_sex)
csv_head.concat(picture_phenotypes.map(&:characteristic))
CSV.open(file_name, "w", csv_options) do |csv|
csv << csv_head
# create lines in csv-file for each user who has uploaded his data
User.includes(:user_picture_phenotypes).order(:id).each do |u|
logger.info("Looking at user #{u.id}")
row = [u.id, u.yearofbirth, u.sex]
picture_phenotypes.each do |pp|
# copy the picture with name to +user_id+_+pic_phenotype_id+.png
# logger.info("Looking for this picture #{pp.id}")
picture = pp.user_picture_phenotypes.where(user_id: u.id).first
# does this user have this pic?
if picture.present? && picture.phenotype_picture.present?
picture_path = picture.phenotype_picture.path
basename = picture_path.split("/")[-1]
filetype = basename.split(".")[-1]
logger.info("FOUND file #{picture_path}, basename is #{basename}")
list_of_pics << picture
row << "#{picture.id}.#{filetype}"
else
row << '-'
end
end
logger.info('Putting a line into CSV')
csv << row
end
end
logger.info('created picture handle csv-file')
zipfile.add("picture_phenotypes_#{time_str}.csv", file_name)
list_of_pics
end
def create_picture_zip(list_of_pics, zipfile)
pic_zipname = "/data/zip/opensnp_picturedump."+time_str+".zip"
Zip::File.open("#{Rails.root}/public/#{pic_zipname}", Zip::File::CREATE) do |z|
list_of_pics.each do |tmp|
begin
file_name = tmp.phenotype_picture.path
basename = file_name.split("/")[-1]
filetype = basename.split(".")[-1]
logger.info("Adding file to zip named #{tmp.id.to_s + "." + filetype}")
z.add(tmp.id.to_s+"."+filetype, file_name)
logger.info("Added #{tmp.id.to_s + "." + filetype}")
rescue => e
logger.info("create_picture_zip: #{e.class}: #{e.message}")
end
end
end
zipfile.add("picture_phenotypes_#{time_str}_all_pics.zip",
"#{Rails.root}/public/#{pic_zipname}")
logger.info('created picture zip file')
end
def create_readme(zipfile)
# make a README containing time of zip - this way, users can compare with page-status
# and see how old the data is
phenotype_count = Phenotype.count
genotype_count = Genotype.count
picture_count = PicturePhenotype.count
File.open("#{tmp_dir}/dump#{time_str}.txt", "w") do |readme|
readme.puts(<<-TXT)
This archive was generated on #{time.ctime} UTC. It contains #{phenotype_count} phenotypes, #{genotype_count} genotypes and #{picture_count} picture phenotypes.
Thanks for using openSNP!
TXT
end
zipfile.add("readme.txt", "#{tmp_dir}/dump#{time_str}.txt")
end
def zip_genotype_files(genotypes, zipfile)
genotypes.each do |gen_file|
yob = gen_file.user.yearofbirth
sex = gen_file.user.sex
to_zip_file = "#{Rails.root}/public/data/#{gen_file.fs_filename}"
if yob == "rather not say"
yob = "unknown"
end
if sex == "rather not say"
sex = "unknown"
end
zipfile.add("user#{gen_file.user_id}_file#{gen_file.id}_yearofbirth_#{yob}_sex_#{sex}.#{gen_file.filetype}.txt",
to_zip_file) unless !File.exist? to_zip_file
end
end
def delete_old_zips
forbidden_files = [link_path,
Rails.root.join('data', 'annotation.zip').to_s,
Rails.root.join('public', 'data', 'zip', "#{dump_file_name}.zip").to_s]
Dir[Rails.root.join('public/data/zip/*.zip')].each do |f|
if (not forbidden_files.include? f) and (File.ftype(f) == "file")
File.delete(f)
end
end
end
def self.public_path
'/data/zip/opensnp_datadump.current.zip'
end
def self.gb_size
file = Rails.root.join('public', self.public_path)
if File.file? file
"(Size: #{(File.size(file).to_f / (2**30)).round(2)})"
else
""
end
end
end

View File

@@ -10,3 +10,8 @@ en:
attributes:
phenotype:
taken: 'has already been entered.'
zipfulldata:
readme: |
This archive was generated on %{time} UTC. It contains %{phenotype_count} phenotypes, %{genotype_count} genotypes and %{picture_count} picture phenotypes.
Thanks for using openSNP!

View File

@@ -34,5 +34,5 @@ PROPERTIES = %w{
}.freeze
100.times do
Phenotype.create! characteristic: "#{BODY_PARTS.sample} #{PROPERTIES.sample}"
Phenotype.find_or_create_by(characteristic: "#{BODY_PARTS.sample} #{PROPERTIES.sample}")
end

File diff suppressed because it is too large Load Diff

View File

@@ -2,6 +2,6 @@
namespace :dump do
desc 'dump all the data'
task full: :environment do
Zipfulldata.perform_async
DataZipperWorker.perform_async
end
end

View File

@@ -2,7 +2,7 @@
FactoryBot.define do
factory :genotype do
genotype_file_name { 'foo.txt' }
genotype { File.new(Rails.root.join('spec', 'fixtures', 'files', 'genotype.txt')) }
user
end
end

View File

@@ -0,0 +1,7 @@
# frozen_string_literal: true
# Defines the :open_humans_profile factory.
FactoryBot.define do
factory :open_humans_profile do
# A sequence gives each generated profile a distinct id: "oh-user-1", "oh-user-2", ...
sequence(:open_humans_user_id) { |n| "oh-user-#{n}" }
end
end

View File

@@ -2,6 +2,7 @@
FactoryBot.define do
factory :user_picture_phenotype do
phenotype_picture { File.new(Rails.root.join('spec', 'fixtures', 'files', 'image.png')) }
variation { 'pink' }
end
end

1
spec/fixtures/files/genotype.txt vendored Normal file
View File

@@ -0,0 +1 @@
assorted genotype data

BIN
spec/fixtures/files/image.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 194 KiB

View File

@@ -0,0 +1,14 @@
# frozen_string_literal: true
RSpec.describe ApplicationRecord do
describe '.copy_csv' do
it 'returns an enumerator' do
expect(described_class.copy_csv('SELECT 1')).to be_a(Enumerator)
end
it 'returns the query result as an Array of CSV rows' do
expect(described_class.copy_csv('SELECT 1 AS foo, 2 AS bar').to_a)
.to eq(["foo;bar\n", "1;2\n"])
end
end
end

View File

@@ -0,0 +1,165 @@
# frozen_string_literal: true
RSpec.describe DataZipperService::GenerateUserPhenotypeCsv do
subject(:service) { described_class.new }
# There needs to be at least one phenotype in the database for the CROSSTAB
# query to work.
let!(:phenotype_1) { create(:phenotype, characteristic: "hitchhiker's thumb") }
let!(:phenotype_2) { create(:phenotype, characteristic: 'number of eyes') }
let(:result) { service.call }
let(:parsed_result) do
CSV.parse(
result.to_a.join,
col_sep: ';',
headers: :first_row
)
end
it 'returns an Enumerator' do
expect(result).to be_a(Enumerator)
end
it 'returns something, that passes as CSV' do
expect(parsed_result).to be_a(CSV::Table)
end
it 'includes a header in the CSV' do
expect(parsed_result.headers).to match(
%w[
user_id
genotype_filename
date_of_birth
chrom_sex
openhumans_name
] + Array.new(Phenotype.count) { an_instance_of(String) }
)
end
it 'includes all phenotype characteristics as columns' do
expect(parsed_result.headers)
.to include("hitchhiker's thumb", 'number of eyes')
end
context 'for users without genotypes' do
let!(:user) { create(:user) }
let!(:user_phenotype) do
create(:user_phenotype, user: user, phenotype: phenotype_1, variation: 'yes')
end
it 'does not include their phenotypes in the CSV' do
expect(parsed_result['user_id']).not_to include(user.id.to_s)
end
end
context 'for users without any phenotypes entered' do
let!(:user) { create(:user) }
let!(:genotype) { create(:genotype, user: user) }
it 'includes them in the CSV' do
expect(parsed_result['user_id']).to include(user.id.to_s)
end
end
context 'for users with genotypes' do
let!(:user_1) { create(:user, sex: 'why not', yearofbirth: 1990) }
let!(:user_2) { create(:user, sex: 'female', yearofbirth: 1970) }
let!(:genotype_1) { create(:genotype, user: user_1) }
let!(:genotype_2) { create(:genotype, user: user_2) }
let!(:genotype_3) { create(:genotype, user: user_2) }
let!(:user_phenotype_1) do
create(
:user_phenotype,
phenotype: phenotype_1,
variation: 'yes',
user: user_1
)
end
let!(:user_phenotype_2) do
create(
:user_phenotype,
phenotype: phenotype_1,
variation: 'no',
user: user_2
)
end
let!(:user_phenotype_3) do
create(
:user_phenotype,
phenotype: phenotype_2,
variation: '27',
user: user_1
)
end
it 'returns a row per genotype' do
expect(parsed_result.to_a.size).to eq(4)
expect(parsed_result.to_a[1..-1]).to eq(
[
[
user_1.id.to_s,
"#{user_1.id}.23andme.#{genotype_1.id}",
'1990',
'why not',
'-',
'yes',
'27'
],
[
user_2.id.to_s,
"#{user_2.id}.23andme.#{genotype_2.id}",
'1970',
'female',
'-',
'no',
'-'
],
[
user_2.id.to_s,
"#{user_2.id}.23andme.#{genotype_3.id}",
'1970',
'female',
'-',
'no',
'-'
]
]
)
end
end
context 'when a phenotype characteristic contains a double quote' do
let!(:genotype) { create(:genotype, user: user) }
let!(:user) { create(:user) }
let!(:phenotype) { create(:phenotype, characteristic: 'prefers " over \'') }
let!(:user_phenotype) do
create(:user_phenotype, phenotype: phenotype, variation: 'yes', user: user)
end
let(:result) do
CSV.parse(
service.call.to_a.join("\n"),
col_sep: ';',
headers: :first_row
)
end
it 'does not fail' do
expect(result.headers.last).to eq('prefers " over \'')
expect(result.to_a.last.last).to eq('yes')
end
end
context 'when a phenotype characteristic clashes with another column name' do
before do
create(:phenotype, characteristic: 'user_yob')
end
it 'fails' do
expect { service.call.to_a }.to raise_error(PG::DuplicateColumn)
end
end
end

View File

@@ -0,0 +1,33 @@
# frozen_string_literal: true
describe DataZipperService::ZipGenotypeFiles do
subject(:zip) do
zipfile = Zip::File.open(Tempfile.new, Zip::File::CREATE)
described_class.new(zipfile).call
zipfile.close
Zip::File.open(zipfile.name)
end
let!(:user_1) { create(:user, yearofbirth: 1970, sex: 'why not') }
let!(:genotype_1) { create(:genotype, user: user_1) }
let!(:open_humans_profile) do
create(:open_humans_profile, user: user_1, open_humans_user_id: 'oh-user')
end
let!(:user_2) { create(:user, yearofbirth: 1994, sex: 'no') }
let!(:genotype_2) { create(:genotype, user: user_2) }
let!(:user_3) { create(:user) }
it 'zips genotype files' do
expect(zip.glob('user*.txt').map(&:name)).to eq(
[
"user#{user_1.id}_file#{genotype_1.id}_yearofbirth_1970_sex_why not.23andme.txt",
"user#{user_2.id}_file#{genotype_2.id}_yearofbirth_1994_sex_no.23andme.txt"
]
)
expect(zip.read(zip.glob('user*.txt').first.name))
.to eq("assorted genotype data\n")
end
end

View File

@@ -0,0 +1,146 @@
# frozen_string_literal: true
RSpec.describe DataZipperService::ZipUserPicturePhenotypes do
subject(:zip) do
zipfile = Zip::File.open(Tempfile.new, Zip::File::CREATE)
described_class.new(zipfile, tmp_dir, time_str).call
zipfile.close
Zip::File.open(zipfile.name)
end
let(:zipfile_write) { Zip::File.open(zipfile_path, Zip::File::CREATE) }
let(:tempfile) { Tempfile.new }
let(:zipfile_path) { tempfile.path }
let(:tmp_dir) do
Rails.root.join('tmp', 'test', 'data_zipper_service', 'zip_user_picture_phenotypes')
end
let(:time_str) { '123' }
let!(:user_1) { create(:user, yearofbirth: 1970, sex: 'why not') }
let!(:genotype_1) { create(:genotype, user: user_1) }
let!(:open_humans_profile) do
create(:open_humans_profile, user: user_1, open_humans_user_id: 'oh-user')
end
let!(:user_2) { create(:user, yearofbirth: 1994, sex: 'no') }
let!(:genotype_2) { create(:genotype, user: user_2) }
let!(:user_3) { create(:user, yearofbirth: 1922, sex: 'male') }
let!(:picture_phenotype_1) do
create(:picture_phenotype, characteristic: 'number of eyes')
end
let!(:picture_phenotype_2) do
create(:picture_phenotype, characteristic: 'length of tongue')
end
let!(:user_picture_phenotype_1) do
create(
:user_picture_phenotype,
picture_phenotype: picture_phenotype_1,
user: user_1
)
end
let!(:user_picture_phenotype_2) do
create(
:user_picture_phenotype,
picture_phenotype: picture_phenotype_1,
user: user_2
)
end
let!(:user_picture_phenotype_3) do
create(
:user_picture_phenotype,
picture_phenotype: picture_phenotype_2,
user: user_1
)
end
# There needs to be at least one Phenotype for the CROSSTAB query to work.
let!(:phenotype) { create(:phenotype) }
before do
FileUtils.mkdir_p(tmp_dir)
end
after do
FileUtils.rm_rf(tmp_dir)
end
it 'adds a CSV with image data to the zip file' do
picture_phenotypes_csv = zip.glob('picture_phenotypes_*.csv').first
expect(CSV.parse(zip.read(picture_phenotypes_csv.name), col_sep: ';')).to eq(
[
[
'user_id',
'date_of_birth',
'chrom_sex',
'number of eyes',
'length of tongue'
],
[
user_1.id.to_s,
'1970',
'why not',
"#{user_picture_phenotype_1.id}.png",
"#{user_picture_phenotype_3.id}.png"
],
[
user_2.id.to_s,
'1994',
'no',
"#{user_picture_phenotype_2.id}.png",
'-'
],
# TODO: Should users without picture phenotypes show up?
[
user_3.id.to_s,
'1922',
'male',
'-',
'-'
]
]
)
end
it 'creates a ZIP file with phenotype images and adds it to the ZIP file' do
zip.extract(
zip.glob('picture_phenotypes_*_all_pics.zip').last.name,
tmp_dir.join('picture_phenotypes_all_pics.zip')
)
Zip::File.open(tmp_dir.join('picture_phenotypes_all_pics.zip')) do |zip|
expect(zip.glob('*').map(&:name).sort).to eq(
[
user_picture_phenotype_1,
user_picture_phenotype_2,
user_picture_phenotype_3
].map(&:id).sort.map { |id| "#{id}.png" }
)
end
end
context 'when a user picture phenotype is missing an actual image' do
before do
user_picture_phenotype_1.phenotype_picture = nil
user_picture_phenotype_1.save!
end
it 'ignores them' do
zip.extract(
zip.glob('picture_phenotypes_*_all_pics.zip').last.name,
tmp_dir.join('picture_phenotypes_all_pics.zip')
)
Zip::File.open(tmp_dir.join('picture_phenotypes_all_pics.zip')) do |zip|
expect(zip.glob('*').map(&:name).sort).to eq(
[
user_picture_phenotype_2,
user_picture_phenotype_3
].map(&:id).sort.map { |id| "#{id}.png" }
)
end
end
end
end

View File

@@ -0,0 +1,105 @@
# frozen_string_literal: true
# Runs DataZipperService against a scratch output directory and inspects the
# dump archive it produces through the "current" symlink.
describe DataZipperService do
  subject(:service) { described_class.new(output_dir: output_dir, logger: logger) }

  let(:output_dir) { Rails.root.join('tmp', 'test', 'zipfulldata') }
  let(:symlink) { output_dir.join('opensnp_datadump.current.zip') }
  let(:picture_zip) { Dir[output_dir.join('opensnp_picturedump.*.zip')].last }
  let(:logger) { instance_double(Logger) }

  before do
    FileUtils.mkdir_p(output_dir)
    # Add dummy phenotype so the CROSSTAB queries don't trip.
    create(:phenotype, characteristic: 'affinity for filling out online questionaires')
    allow(logger).to receive(:info)
  end

  after do
    FileUtils.rm_rf(output_dir)
  end

  it 'creates a new dump file and symlink' do
    service.call

    expect(File.symlink?(symlink)).to be(true)
    # An `expect(...)` without a matcher asserts nothing, so the target check
    # needs an explicit `.to be(true)`. Joining with output_dir keeps the
    # check valid for both absolute and relative link targets.
    link_target = output_dir.join(File.readlink(symlink))
    expect(File.exist?(link_target)).to be(true)
  end

  it 'adds a README' do
    service.call

    Zip::File.open(symlink) do |zip|
      readme = zip.read('readme.txt')

      expect(readme).to eq(<<~README)
        This archive was generated on #{service.time.ctime} UTC. \
        It contains 1 phenotypes, 0 genotypes and 0 picture phenotypes.
        Thanks for using openSNP!
      README
    end
  end

  it 'adds a phenotype csv' do
    service.call

    Zip::File.open(symlink) do |zip|
      expect(zip.glob('phenotypes_*.csv')).to be_present
    end
  end

  it 'adds a picture phenotype zip and csv' do
    service.call

    Zip::File.open(symlink) do |zip|
      expect(zip.glob('picture_phenotypes_*.csv')).to be_present
      expect(zip.glob('picture_phenotypes_*_all_pics.zip')).to be_present
    end
  end

  it 'adds genotype files to the ZIP' do
    create(:genotype)

    service.call

    Zip::File.open(symlink) do |zip|
      expect(zip.glob('user*.txt').count).to eq(1)
    end
  end

  context 'when deleting files' do
    let(:unrelated_file_path) { output_dir.join('do_not_delete_me.zip') }
    let(:old_dump_file_path) { output_dir.join('opensnp_datadump.197001010000.zip') }

    # Seed the output directory with one stale dump and one unrelated file.
    before do
      [unrelated_file_path, old_dump_file_path].each do |path|
        FileUtils.touch(path)
      end
    end

    after do
      [unrelated_file_path, old_dump_file_path].each do |path|
        FileUtils.rm(path) if File.exist?(path)
      end
    end

    it 'deletes old dump files' do
      service.call

      expect(File.exist?(old_dump_file_path)).to be(false)
    end

    it 'does not delete unrelated files' do
      service.call

      expect(File.exist?(unrelated_file_path)).to be(true)
    end
  end

  describe '.public_path' do
    it 'returns the public path of the zip file' do
      expect(described_class.public_path)
        .to eq('/data/zip/opensnp_datadump.current.zip')
    end
  end
end

View File

@@ -0,0 +1,19 @@
# frozen_string_literal: true
# Verifies that the worker only delegates to DataZipperService.
RSpec.describe DataZipperWorker do
  subject(:worker) { described_class.new }

  let(:data_zipper_service) { instance_double(DataZipperService) }

  describe '#perform' do
    # Stub the collaborators up front, then check the recorded calls afterwards.
    before do
      allow(DataZipperService)
        .to receive(:new)
        .with(logger: worker.logger)
        .and_return(data_zipper_service)
      allow(data_zipper_service).to receive(:call)
    end

    it 'calls DataZipperService' do
      worker.perform

      expect(DataZipperService).to have_received(:new).with(logger: worker.logger).once
      expect(data_zipper_service).to have_received(:call).once
    end
  end
end

View File

@@ -1,112 +0,0 @@
# frozen_string_literal: true
# Specs for the Zipfulldata job: each helper is called with a zipfile double,
# and the files it writes into the job's temporary directory are inspected.
describe Zipfulldata do
  let(:user) { create(:user) }
  let(:phenotype) { create(:phenotype, characteristic: "jump height") }
  let!(:user_phenotype) do
    create(:user_phenotype, phenotype_id: phenotype.id, variation: '1km', user: user)
  end
  let(:genotype) do
    create(:genotype, user_id: user.id,
                      genotype: File.open("#{Rails.root}/test/data/23andMe_test.csv"))
  end
  let(:job) { Zipfulldata.new }
  let(:csv_options) { { col_sep: ';' } }
  let(:zipfile) { double('zipfile') }

  before do
    allow(Sidekiq::Client).to receive(:enqueue).with(Preparsing, instance_of(Integer))
    # Point the job at a per-example temporary directory with a random suffix.
    tmp_dir = job.instance_variable_get(:@tmp_dir) + '_test_' +
              Digest::SHA1.hexdigest("#{Time.now.to_i}#{rand}")
    job.instance_variable_set(:@tmp_dir, tmp_dir)
    FileUtils.touch job.zip_fs_path.to_s
    Dir.mkdir(tmp_dir)
    # Force creation of the lazily defined genotype.
    genotype
  end

  after do
    link = Rails.root.join("public/data/zip/opensnp_datadump.current.zip")
    FileUtils.rm(link) if File.exist?(link)
    FileUtils.rm(job.zip_fs_path) if File.exist?(job.zip_fs_path)
    FileUtils.rm(job.zip_public_path) if File.exist?(job.zip_public_path)
  end

  it "creates user CSVs" do
    user2 = create(:user)
    genotype2 = create(:genotype, user_id: user2.id)
    expect(zipfile).to receive(:add).
      with("phenotypes_#{job.time_str}.csv",
           "#{job.tmp_dir}/dump#{job.time_str}.csv")

    job.create_user_csv([genotype, genotype2], zipfile)

    # CSV.read takes its options as keywords; a positional hash raises
    # ArgumentError on Ruby 3, so splat it.
    csv = CSV.read("#{job.tmp_dir}/dump#{job.time_str}.csv", **job.csv_options)
    exp_header = ['user_id', 'genotype_filename', 'date_of_birth', 'chrom_sex',
                  'openhumans_name', phenotype.characteristic]
    exp_row1 = [user.id.to_s, genotype.fs_filename, user.yearofbirth, user.sex,
                '-', user.user_phenotypes.first.variation]
    exp_row2 = [user2.id.to_s, genotype2.fs_filename, user2.yearofbirth,
                user2.sex, '-', '-']
    expect(user.user_phenotypes.first.phenotype).to eq(phenotype)
    expect(csv).to eq([exp_header, exp_row1, exp_row2])
  end

  it "creates picture phenotype CSVs" do
    user2 = create(:user)
    pp = create(:picture_phenotype)
    upp = create(:user_picture_phenotype, picture_phenotype: pp,
                                          user: user)
    pic = double('picture')
    expect(pic).to receive(:path).and_return("#{Rails.root}/foo/bar.png")
    allow_any_instance_of(UserPicturePhenotype).to receive(:phenotype_picture).
      and_return(pic)
    expect(zipfile).to receive(:add).
      with("picture_phenotypes_#{job.time_str}.csv",
           "#{job.tmp_dir}/picture_dump#{job.time_str}.csv")

    job.create_picture_phenotype_csv(zipfile)

    # Keyword splat for the same Ruby 3 reason as in the user CSV example.
    csv = CSV.read("#{job.tmp_dir}/picture_dump#{job.time_str}.csv", **csv_options)
    expect(csv).to eq(
      [["user_id", "date_of_birth", "chrom_sex", "Eye color"],
       [user.id.to_s, user.yearofbirth, user.sex, "#{upp.id}.png"],
       [user2.id.to_s, user2.yearofbirth, user2.sex, '-']]
    )
  end

  it "creates a readme file" do
    expect(Phenotype).to receive(:count).and_return(42)
    expect(Genotype).to receive(:count).and_return(23)
    expect(PicturePhenotype).to receive(:count).and_return(5)
    expect(zipfile).to receive(:add).
      with("readme.txt", "#{job.tmp_dir}/dump#{job.time_str}.txt")

    job.create_readme(zipfile)

    # Previously the expected text was built but never compared with the file.
    # Assert on the content, using `include` so that the exact line breaks in
    # the generated file do not matter.
    readme = File.read("#{job.tmp_dir}/dump#{job.time_str}.txt")
    expect(readme).to include("This archive was generated on #{job.time.ctime} UTC.")
    expect(readme).to include('It contains 42 phenotypes, 23 genotypes and 5 picture phenotypes.')
    expect(readme).to include('Thanks for using openSNP!')
  end

  it "zips genotype files" do
    expect(zipfile).to receive(:add).with(
      "user#{user.id}_file#{genotype.id}_yearofbirth_#{user.yearofbirth}" +
      "_sex_#{user.sex}.#{genotype.filetype}.txt",
      "#{Rails.root}/public/data/#{genotype.fs_filename}")

    job.zip_genotype_files([genotype], zipfile)
  end

  it "runs the job" do
    upp = double('user_picture_phenotype')
    expect(Dir).to receive(:exists?).with(job.tmp_dir).and_return(false)
    expect(Dir).to receive(:mkdir).with(job.tmp_dir)
    expect(Zip::File).to receive(:open).with(job.zip_fs_path, Zip::File::CREATE).
      and_yield(zipfile)
    expect(job).to receive(:create_user_csv).with([genotype], zipfile)
    expect(job).to receive(:create_picture_phenotype_csv).with(zipfile).and_return([upp])
    expect(job).to receive(:create_picture_zip).with([upp], zipfile)
    expect(job).to receive(:create_readme).with(zipfile)
    expect(job).to receive(:zip_genotype_files).with([genotype], zipfile)
    expect(FileUtils).to receive(:ln_sf).with(
      Rails.root.join("public/data/zip/#{job.dump_file_name}.zip"),
      Rails.root.join("public/data/zip/opensnp_datadump.current.zip"))
    expect(FileUtils).to receive(:rm_rf).with(job.tmp_dir)

    expect(job.run).to be(true)
  end
end