new commands for staging and reset

parent 963e3e0d
This diff is collapsed.
...@@ -35,12 +35,9 @@ module Embulk ...@@ -35,12 +35,9 @@ module Embulk
page.each do |record| page.each do |record|
reference = record[0] reference = record[0]
data_chunk = Base64.decode64(record[1]) data_chunk = Base64.decode64(record[1])
data_set_directory = @output_path.end_with?("/") ? @output_path : @output_path + "/" @dataset_utils = DatasetUtils.new("")
ref = reference.reverse.sub("/".reverse, ".".reverse).reverse.sub(record[2]+"/", "") data_set_directory = @dataset_utils.appendSlashTo(@output_path)
if ref.end_with?(".none") file_path = @dataset_utils.referenceToPath(reference, data_set_directory, record[2])
ref = ref[0...-5]
end
file_path = data_set_directory + ref
write_mode = 'ab' write_mode = 'ab'
if record[3] == DatasetUtils::DELETE if record[3] == DatasetUtils::DELETE
File.delete(file_path) if File.exist?(file_path) File.delete(file_path) if File.exist?(file_path)
...@@ -48,7 +45,7 @@ module Embulk ...@@ -48,7 +45,7 @@ module Embulk
if record[3] == TRUE.to_s if record[3] == TRUE.to_s
write_mode = 'w' write_mode = 'w'
end end
dirname = File.dirname(data_set_directory + ref) dirname = File.dirname(file_path)
unless File.directory?(dirname) unless File.directory?(dirname)
FileUtils.mkdir_p(dirname) FileUtils.mkdir_p(dirname)
end end
......
...@@ -46,11 +46,12 @@ module Embulk ...@@ -46,11 +46,12 @@ module Embulk
hash = record[7] hash = record[7]
begin begin
if eof == DatasetUtils::DELETE if eof == DatasetUtils::DELETE
reference = [dataset, filename, extension].join("/") reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
@wendelin.delete(reference) @wendelin.delete(reference)
else else
reference = [supplier, dataset, filename, extension, eof, size, hash].join("/") reference = [supplier, dataset, filename, extension, eof, size, hash].join(DatasetUtils::REFERENCE_SEPARATOR)
if not @wendelin.ingest(reference, data_chunk) split = eof != ""
if not @wendelin.ingest(reference, data_chunk, split)
raise "could not ingest" raise "could not ingest"
end end
end end
......
require_relative '../filelogger' require_relative '../filelogger'
require_relative '../dataset_utils'
class Index class Index
include Singleton include Singleton
...@@ -19,21 +20,20 @@ module Embulk ...@@ -19,21 +20,20 @@ module Embulk
class BinaryParserPlugin < ParserPlugin class BinaryParserPlugin < ParserPlugin
Plugin.register_parser("binary", self) Plugin.register_parser("binary", self)
CHUNK_SIZE = 50
MEGA = 1000000
EOF = "EOF"
def self.transaction(config, &control) def self.transaction(config, &control)
tool_dir = config.param('tool_dir', :string, default: ".") tool_dir = config.param('tool_dir', :string, default: ".")
@logger = LogManager.instance() @logger = LogManager.instance()
@logger.setFilename(tool_dir, "parser") @logger.setFilename(tool_dir, "parser")
task = { task = {
chunk_size: config.param('chunk_size', :float, default: CHUNK_SIZE) * MEGA, chunk_size: config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA,
supplier: config.param("supplier", :string, default: "parser"), supplier: config.param("supplier", :string, default: "parser"),
data_set: config.param("data_set", :string), data_set: config.param("data_set", :string),
input_plugin: config.param("storage", :string, default: "parser"), input_plugin: config.param("storage", :string, default: "parser"),
date: Time.now.strftime("%Y-%m-%d_%H-%M-%S") date: Time.now.strftime("%Y-%m-%d_%H-%M-%S")
} }
if task['chunk_size'] == 0
task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end
columns = [ columns = [
Column.new(0, "supplier", :string), Column.new(0, "supplier", :string),
Column.new(1, "data_set", :string), Column.new(1, "data_set", :string),
...@@ -71,7 +71,7 @@ module Embulk ...@@ -71,7 +71,7 @@ module Embulk
end end
private private
def each_chunk(file, filename, chunk_size=CHUNK_SIZE) def each_chunk(file, filename, chunk_size=DatasetUtils::CHUNK_SIZE)
extension = @index.to_s.rjust(3, "0") extension = @index.to_s.rjust(3, "0")
npart = 0 npart = 0
next_byte = file.read(1) next_byte = file.read(1)
...@@ -89,7 +89,7 @@ module Embulk ...@@ -89,7 +89,7 @@ module Embulk
data += file.read(chunk_size) data += file.read(chunk_size)
next_byte = file.read(1) next_byte = file.read(1)
if not next_byte if not next_byte
eof = EOF eof = DatasetUtils::EOF
if first if first
# this means that the whole file will be ingested at once (not split) # this means that the whole file will be ingested at once (not split)
eof = "" eof = ""
......
...@@ -23,6 +23,9 @@ class WendelinClient ...@@ -23,6 +23,9 @@ class WendelinClient
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while checking if reference exists: " + e.to_s) @logger.error("An error occurred while checking if reference exists: " + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
if e.to_s.include? "Unauthorized" or e.to_s.include? "401"
raise e
end
return FALSE return FALSE
else else
return res.to_s == 'TRUE' return res.to_s == 'TRUE'
...@@ -53,27 +56,27 @@ class WendelinClient ...@@ -53,27 +56,27 @@ class WendelinClient
end end
end end
def ingest(reference, data_chunk) def ingest(reference, data_chunk, split)
@logger.info("Ingestion reference: #{reference}", print=TRUE) @logger.info("Ingestion reference: #{reference}", print=TRUE)
if Time.new - @last_ingestion < 2 if split and Time.new - @last_ingestion < 3
# avoid send ingestions to close (specially for split ones) # avoid to send split ingestions to close
sleep 2 sleep 3
end end
if exists(reference) if exists(reference)
@logger.info("There is another ingestion already done for the pair data_set-filename. Reference "\ @logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\
+ reference, print=TRUE) + reference, print=TRUE)
@logger.info("Rename your reference or delete the older ingestion.", print=TRUE) @logger.info("Rename your file or download the full dataset to make local changes.", print=TRUE)
return FALSE return FALSE
end end
if reference.include? "#" or reference.include? "+" if reference.include? "#" or reference.include? "+"
raise "Invalid chars in file name. Please rename it." raise "invalid chars in file name. Please rename it."
end end
begin begin
uri = URI("#{@erp5_url}/ingest?reference=#{reference}") uri = URI("#{@erp5_url}/ingest?reference=#{reference}")
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while generating url: " + e.to_s) @logger.error("An error occurred while generating url: " + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
raise "Invalid chars in file name. Please rename it." raise "invalid chars in file name. Please rename it."
end end
response = handleRequest(uri, reference, data_chunk) response = handleRequest(uri, reference, data_chunk)
if response == FALSE if response == FALSE
...@@ -138,7 +141,7 @@ class WendelinClient ...@@ -138,7 +141,7 @@ class WendelinClient
res = Net::HTTP.start(uri.hostname, uri.port, res = Net::HTTP.start(uri.hostname, uri.port,
:use_ssl => (uri.scheme == 'https'), :use_ssl => (uri.scheme == 'https'),
:verify_mode => OpenSSL::SSL::VERIFY_NONE, :verify_mode => OpenSSL::SSL::VERIFY_NONE,
:ssl_timeout => 20, :open_timeout => 20, :read_timeout => 20, :ssl_timeout => 300, :open_timeout => 300, :read_timeout => 300,
) do |http| ) do |http|
http.request(req) http.request(req)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment