ebulk: discard changes feature

- new --discard-changes parameter for the pull command
- new help and example files
- several fixes
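
A quick usage sketch (hedged: the exact invocation is inferred from the bullet above and the new -dc/--discard-changes flag added in this commit, and <dataset-reference> is only a placeholder, so treat this as an assumption rather than documented syntax):

    # re-download the dataset, discarding any local edits in the dataset directory
    ebulk pull <dataset-reference> --discard-changes
    # short form of the same flag
    ebulk pull <dataset-reference> -dc
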
parent f7de6621
...@@ -4,6 +4,10 @@ DOWN_URL='https://softinst104003.host.vifib.net/erp5/' ...@@ -4,6 +4,10 @@ DOWN_URL='https://softinst104003.host.vifib.net/erp5/'
ING_URL='https://softinst104003.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk' ING_URL='https://softinst104003.host.vifib.net/erp5/portal_ingestion_policies/wendelin_embulk'
EBULK_DATA_PATH=~/.ebulk EBULK_DATA_PATH=~/.ebulk
EBULK_DATASET_FILE_NAME="/.ebulk_dataset"
DATASET_REPORT_FILE_NAME="/.dataset-task-report"
DATASET_COMPLETE_FILE_NAME="/.dataset-completed"
DISCARD_CHANGES_FILE_NAME="/.discard-changes"
LOG_DIR="$EBULK_DATA_PATH/logs" LOG_DIR="$EBULK_DATA_PATH/logs"
TOOL_PATH="$(dirname "$0")/ebulk-data" TOOL_PATH="$(dirname "$0")/ebulk-data"
DOWN_FILE="$EBULK_DATA_PATH/download-config.yml" DOWN_FILE="$EBULK_DATA_PATH/download-config.yml"
...@@ -46,6 +50,11 @@ function checkParameters { ...@@ -46,6 +50,11 @@ function checkParameters {
fi fi
if [ "$STORAGE" = "" ] ; then if [ "$STORAGE" = "" ] ; then
if [ ! -d "$DATASET_DIR" ]; then if [ ! -d "$DATASET_DIR" ]; then
if [ "$STATUS" ]; then
echo
echo -e "${ORANGE}[ERROR] ${GREEN}'$DATASET_DIR'${ORANGE} is not a dataset directory.${NC}"
echo >&2; return 1
fi
echo echo
mkdir "$DATASET_DIR" 2>/dev/null mkdir "$DATASET_DIR" 2>/dev/null
if [ ! $? -eq 0 ]; then if [ ! $? -eq 0 ]; then
...@@ -56,16 +65,29 @@ function checkParameters { ...@@ -56,16 +65,29 @@ function checkParameters {
helpReadme >&2; return 1 helpReadme >&2; return 1
fi fi
fi fi
EBULK_DATASET_FILE="$DATASET_DIR/.ebulk_dataset" EBULK_DATASET_FILE="$DATASET_DIR$EBULK_DATASET_FILE_NAME"
if [[ $DATASET_DIR != $REFERENCE ]]; then if [[ $DATASET_DIR != $REFERENCE ]]; then
if [ "$REFERENCE" = "." ] ; then if [ "$REFERENCE" = "." ] ; then
REFERENCE=$(basename "$DATASET_DIR") REFERENCE=$(basename "$DATASET_DIR")
fi fi
DATA_SET=$REFERENCE DATA_SET=$REFERENCE
if [ -f "$EBULK_DATASET_FILE" ]; then
PREVIOUS_DATA_SET=$(cat "$EBULK_DATASET_FILE" 2>/dev/null)
if [[ "$PREVIOUS_DATA_SET" != "$REFERENCE" ]]; then
DATASET_REPORT_FILE="$DATASET_DIR$DATASET_REPORT_FILE_NAME"
if [ -f "$DATASET_REPORT_FILE" ]; then
rm -f ${DATASET_REPORT_FILE}
fi
DATASET_COMPLETE_FILE="$DATASET_DIR$DATASET_COMPLETE_FILE_NAME"
if [ -f "$DATASET_COMPLETE_FILE" ]; then
rm -f ${DATASET_COMPLETE_FILE}
fi
fi
fi
echo $REFERENCE > "$EBULK_DATASET_FILE" 2>/dev/null echo $REFERENCE > "$EBULK_DATASET_FILE" 2>/dev/null
else else
if [ -f "$EBULK_DATASET_FILE" ]; then if [ -f "$EBULK_DATASET_FILE" ]; then
DATA_SET=$(cat "$DATASET_DIR/.ebulk_dataset" 2>/dev/null) DATA_SET=$(cat "$EBULK_DATASET_FILE" 2>/dev/null)
else else
DATA_SET=$(basename "$DATASET_DIR") DATA_SET=$(basename "$DATASET_DIR")
if [ "$DATA_SET" != "." ] ; then if [ "$DATA_SET" != "." ] ; then
...@@ -403,7 +425,7 @@ function askS3parameters { ...@@ -403,7 +425,7 @@ function askS3parameters {
} }
function stage { function stage {
EBULK_DATASET_FILE="./.ebulk_dataset" EBULK_DATASET_FILE=".$EBULK_DATASET_FILE_NAME"
if [ ! -f "$EBULK_DATASET_FILE" ]; then if [ ! -f "$EBULK_DATASET_FILE" ]; then
echo echo
echo -e "${ORANGE}[ERROR] You are not in a dataset directory." echo -e "${ORANGE}[ERROR] You are not in a dataset directory."
...@@ -461,6 +483,8 @@ while [ "$1" != "" ]; do ...@@ -461,6 +483,8 @@ while [ "$1" != "" ]; do
;; ;;
-a | --advanced ) ADVANCED=true -a | --advanced ) ADVANCED=true
;; ;;
-dc | --discard-changes ) DISCARD_CHANGES=true
;;
-c | --chunk ) shift -c | --chunk ) shift
CHUNK=$1 CHUNK=$1
;; ;;
...@@ -490,7 +514,7 @@ while [ "$1" != "" ]; do ...@@ -490,7 +514,7 @@ while [ "$1" != "" ]; do
shift shift
done done
for ELEMENT in '' '-d' '--directory' '-s' '--storage' '-cs' '--custom-storage' '-a' '--advanced' '-c' '--chunk'; do for ELEMENT in '' '-d' '--directory' '-s' '--storage' '-cs' '--custom-storage' '-a' '--advanced' '-c' '--chunk' '-dc' '--discard-changes'; do
if [ "$ELEMENT" = "$REFERENCE" ]; then if [ "$ELEMENT" = "$REFERENCE" ]; then
REFERENCE="." REFERENCE="."
fi fi
...@@ -554,7 +578,13 @@ case $OPERATION in ...@@ -554,7 +578,13 @@ case $OPERATION in
fi fi
echo "### DATASET DOWNLOAD ###" echo "### DATASET DOWNLOAD ###"
echo echo
echo -e "** The dataset will be downloaded in the specified directory: $DATASET_DIR" if [ "$DISCARD_CHANGES" != "" ] ; then
DISCARD_CHANGES_FILE="$DATASET_DIR$DISCARD_CHANGES_FILE_NAME"
touch "$DISCARD_CHANGES_FILE" 2>/dev/null
echo -e "** Discard all local changes in directory: $DATASET_DIR"
else
echo -e "** The dataset will be downloaded in the specified directory: $DATASET_DIR"
fi
echo echo
read -n 1 -s -r -p "Press any key to continue" read -n 1 -s -r -p "Press any key to continue"
echo echo
...@@ -588,8 +618,9 @@ case $OPERATION in ...@@ -588,8 +618,9 @@ case $OPERATION in
PARAMETER_FUNCTION=askFTPparameters PARAMETER_FUNCTION=askFTPparameters
STORAGE_GEM=embulk-input-ftp STORAGE_GEM=embulk-input-ftp
;; ;;
*) echo -e "${ORANGE}[ERROR] '$STORAGE' storage is not available in ebulk tool yet.${NC}" *) echo -e "${ORANGE}[ERROR] '$STORAGE' storage is not available in ebulk tool yet or it is not a valid storage.${NC}"
echo "[INFO] If you want to configure yourself this storage, you can run the tool with parameter --custom-storage" echo "[INFO] If you want to configure yourself this storage, you can run the tool with parameter --custom-storage"
echo "[INFO] Current Ebulk version has the following storages available: ftp, http, s3."
echo echo
exit exit
esac esac
......
...@@ -11,6 +11,7 @@ in: ...@@ -11,6 +11,7 @@ in:
user: $USER user: $USER
password: $pwd password: $pwd
tool_dir: $TOOL_DIR tool_dir: $TOOL_DIR
status: $STATUS
out: out:
type: wendelin type: wendelin
......
...@@ -11,15 +11,18 @@ class DatasetUtils ...@@ -11,15 +11,18 @@ class DatasetUtils
RESUME_OPERATION_FILE = ".resume-operation" RESUME_OPERATION_FILE = ".resume-operation"
INITIAL_INGESTION_FILE = ".initial-ingestion" INITIAL_INGESTION_FILE = ".initial-ingestion"
STAGED_FILE = ".staged" STAGED_FILE = ".staged"
DISCARD_CHANGES_FILE = ".discard-changes"
RUN_DONE = "done" RUN_DONE = "done"
RUN_ERROR = "error" RUN_ERROR = "error"
RUN_ABORTED = "aborted" RUN_ABORTED = "aborted"
DELETE = "DELETE" DELETE = "DELETE"
RENAME = "RENAME"
INGESTION = "ingestion" INGESTION = "ingestion"
ADD = "add" ADD = "add"
REMOVE = "remove" REMOVE = "remove"
STATUS_NEW = "new" STATUS_NEW = "new"
STATUS_RENAMED = "renamed"
STATUS_MODIFIED = "modified" STATUS_MODIFIED = "modified"
STATUS_DELETED = "deleted" STATUS_DELETED = "deleted"
STAGE_ADD="add" STAGE_ADD="add"
...@@ -31,6 +34,7 @@ class DatasetUtils ...@@ -31,6 +34,7 @@ class DatasetUtils
OVERWRITE = "overwrite: " OVERWRITE = "overwrite: "
OUTPUT_MODIFIED = "modified: " OUTPUT_MODIFIED = "modified: "
OUTPUT_DELETED = "deleted: " OUTPUT_DELETED = "deleted: "
OUTPUT_RENAMED = "renamed: "
MEGA = 1000000 MEGA = 1000000
EOF = "EOF" EOF = "EOF"
...@@ -38,6 +42,7 @@ class DatasetUtils ...@@ -38,6 +42,7 @@ class DatasetUtils
NONE_EXT = "none" NONE_EXT = "none"
REFERENCE_SEPARATOR = "/" REFERENCE_SEPARATOR = "/"
RECORD_SEPARATOR = ";" RECORD_SEPARATOR = ";"
DATE_FORMAT = "%Y-%m-%d-%H-%M-%S"
def initialize(data_set_directory) def initialize(data_set_directory)
@data_set_directory = data_set_directory @data_set_directory = data_set_directory
...@@ -48,25 +53,27 @@ class DatasetUtils ...@@ -48,25 +53,27 @@ class DatasetUtils
@resume_operation_file = @data_set_directory + RESUME_OPERATION_FILE @resume_operation_file = @data_set_directory + RESUME_OPERATION_FILE
@initial_ingestion_file = @data_set_directory + INITIAL_INGESTION_FILE @initial_ingestion_file = @data_set_directory + INITIAL_INGESTION_FILE
@staged_file = @data_set_directory + STAGED_FILE @staged_file = @data_set_directory + STAGED_FILE
@discard_changes_file = @data_set_directory + DISCARD_CHANGES_FILE
end end
def getLocalPaths(paths) def getLocalPaths(paths)
return paths.map {|path| return paths.map {|path|
next [] unless Dir.exist?(path) next [] unless Dir.exist?(path)
Dir[(path + '/**/*').gsub! '//', '/'] Dir[(path + '/**/*').gsub! '//', '/']
}.flatten.select{ |file| File.file?(file) } }.flatten.select{ |file| File.file?(file) }
end end
def getLocalFiles(remove=nil) def getLocalFiles(remove=nil)
local_files = {} local_files = {}
begin begin
File.readlines(@task_report_file).each do |line| File.readlines(@task_report_file).each do |line|
record = line.split(RECORD_SEPARATOR) record = line.split(RECORD_SEPARATOR)
if record[1].chomp == RUN_DONE if record[1].chomp == RUN_DONE
if (remove.nil?) || (remove != record[0]) if (remove.nil?) || (remove != record[0])
local_files[record[0]] = {"size" => record[2].chomp, "hash" => record[3].chomp, "status" => record[1].chomp, "modification_date" => record[4].chomp } local_files[record[0]] = {"size" => record[2].chomp, "hash" => record[3].chomp,
"status" => record[1].chomp, "modification_date" => record[4].chomp }
end end
end end
end end
rescue Exception => e rescue Exception => e
@logger.error("An error occurred in DatasetUtils method 'getLocalFiles':" + e.to_s) @logger.error("An error occurred in DatasetUtils method 'getLocalFiles':" + e.to_s)
...@@ -79,11 +86,12 @@ class DatasetUtils ...@@ -79,11 +86,12 @@ class DatasetUtils
begin begin
File.delete(@temp_report_file) if File.exist?(@temp_report_file) File.delete(@temp_report_file) if File.exist?(@temp_report_file)
if local_files.empty? if local_files.empty?
File.open(@temp_report_file, 'w') {} File.open(@temp_report_file, 'w') {}
else else
local_files.each do |key, array| local_files.each do |key, array|
File.open(@temp_report_file, 'ab') { |file| file.puts(key+RECORD_SEPARATOR+array["status"]+RECORD_SEPARATOR+array["size"].to_s+RECORD_SEPARATOR+array["hash"]+RECORD_SEPARATOR+array["modification_date"]) } record = [key, array["status"], array["size"].to_s, array["hash"], array["modification_date"]].join(RECORD_SEPARATOR)
end File.open(@temp_report_file, 'ab') { |file| file.puts(record) }
end
end end
FileUtils.cp_r(@temp_report_file, @task_report_file, :remove_destination => true) FileUtils.cp_r(@temp_report_file, @task_report_file, :remove_destination => true)
rescue Exception => e rescue Exception => e
...@@ -98,43 +106,47 @@ class DatasetUtils ...@@ -98,43 +106,47 @@ class DatasetUtils
end end
end end
def saveCurrentOperation(operation, reference) def saveCurrentOperation(operation, reference, new_reference)
if File.exist?(@resume_operation_file) if File.exist?(@resume_operation_file)
File.delete(@resume_operation_file) File.delete(@resume_operation_file)
end end
File.open(@resume_operation_file, 'w') { |file| file.puts(operation+RECORD_SEPARATOR+reference) } record = new_reference ? [operation, reference, new_reference].join(RECORD_SEPARATOR) : [operation, reference].join(RECORD_SEPARATOR)
File.open(@resume_operation_file, 'w') { |file| file.puts(record) }
end end
def reportUpToDate(data_stream_dict) def reportUpToDate(data_stream_dict, data_set)
begin begin
if not reportFileExist() and not completedFileExist() # directory never downloaded -new or used for partial ingestions-
# directory never downloaded -new or used for partial ingestions- return TRUE if not reportFileExist() and not completedFileExist()
return TRUE # download not finished
end return FALSE if reportFileExist() and not completedFileExist()
if reportFileExist() and not completedFileExist() return FALSE if data_stream_dict["status_code"] == 2
# download not finished return TRUE if data_stream_dict["status_code"] != 0
return FALSE return TRUE if data_stream_dict["result"].empty?
end changes = getRemoteChangedDataStreams(data_stream_dict["result"], data_set)
if data_stream_dict["status_code"] == 2
return FALSE
end
if data_stream_dict["status_code"] != 0
return TRUE
end
changes = getRemoteChangedDataStreams(data_stream_dict["result"])
if changes.empty? if changes.empty?
return TRUE return TRUE
elsif changes.length == 1 elsif changes.length == 1
# check if the unique detected change corresponds to an interrumped ingestion # check if the unique detected change corresponds to an interrupted ingestion
if File.exist?(@resume_operation_file) if File.exist?(@resume_operation_file)
operation=File.open(@resume_operation_file).read.chomp.split(RECORD_SEPARATOR) operation=File.open(@resume_operation_file).read.chomp.split(RECORD_SEPARATOR)
if operation[0] == INGESTION if operation[1] == changes[0]["reference"]
if operation[1] == changes[0]["reference"] @logger.info("File '#{operation[1]}' was detected as a change in the remote dataset, but it is a false positive.")
File.delete(@resume_operation_file) @logger.info("That file's ingestion was interrupted right after it was successfully ingested, but the local report is outdated.")
return TRUE @logger.info("Dataset is up to date; the file operation was reported as ingested in the local report.", print=TRUE)
new_reference = operation[0] == RENAME ? changes[0]["new_reference"] : FALSE
if new_reference
file_path = referenceToPath(new_reference, @data_set_directory, data_set)
else
file_path = referenceToPath(changes[0]["reference"], @data_set_directory, data_set)
end end
size = changes[0]["size"]
hash = changes[0]["hash"]
addToReport(changes[0]["reference"], RUN_DONE, size, hash, data_set, new_reference)
File.delete(@resume_operation_file)
return TRUE
end end
end end
end end
return FALSE return FALSE
rescue Exception => e rescue Exception => e
...@@ -147,33 +159,36 @@ class DatasetUtils ...@@ -147,33 +159,36 @@ class DatasetUtils
def showChanges(changes, status) def showChanges(changes, status)
changes.each do |change| changes.each do |change|
if status != "" if status != ""
status_output = status status_output = status
elsif change["status"] == STATUS_NEW elsif change["status"] == STATUS_NEW
status_output = OUTPUT_NEW status_output = OUTPUT_NEW
elsif change["status"] == STATUS_MODIFIED elsif change["status"] == STATUS_MODIFIED
status_output = OUTPUT_MODIFIED status_output = OUTPUT_MODIFIED
elsif change["status"] == STATUS_DELETED elsif change["status"] == STATUS_DELETED
status_output = OUTPUT_DELETED status_output = OUTPUT_DELETED
elsif change["status"] == STATUS_RENAMED
status_output = OUTPUT_RENAMED
else else
status_output = "no-status" status_output = "change: "
end end
path = status != OVERWRITE ? change["path"] : change path = status != OVERWRITE ? change["path"] : change
@logger.info(" #{status_output}#{path}", print=TRUE) new_path = change["status"] == STATUS_RENAMED ? " --> #{change["new_path"]}" : ""
@logger.info(" #{status_output}#{path}#{new_path}", print=TRUE)
end end
end end
def showChangesList(changes, message, print_short, status="") def showChangesList(changes, message, print_short, status="")
if not changes.empty? if not changes.empty?
if message and message != "" if message and message != ""
@logger.info(message, print=TRUE) @logger.info(message, print=TRUE)
end end
if print_short and changes.length > 200 if print_short and changes.length > 200
limit = changes.length > 300 ? 100 : changes.length/3 limit = changes.length > 300 ? 100 : changes.length/3
showChanges(changes[0, limit], status) showChanges(changes[0, limit], status)
puts "....." puts "....."
showChanges(changes[changes.length-limit, changes.length-1], status) showChanges(changes[changes.length-limit, changes.length-1], status)
else else
showChanges(changes, status) showChanges(changes, status)
end end
end end
end end
...@@ -206,6 +221,14 @@ class DatasetUtils ...@@ -206,6 +221,14 @@ class DatasetUtils
puts puts
end end
def deleteDiscardChangesFile()
File.delete(@discard_changes_file) if File.exist?(@discard_changes_file)
end
def discardChangesFileExist()
return File.exist?(@discard_changes_file)
end
def deleteCompletedFile() def deleteCompletedFile()
File.delete(@completed_file) if File.exist?(@completed_file) File.delete(@completed_file) if File.exist?(@completed_file)
end end
...@@ -223,7 +246,13 @@ class DatasetUtils ...@@ -223,7 +246,13 @@ class DatasetUtils
end end
def reportFileExist() def reportFileExist()
return File.exist?(@task_report_file) return TRUE if File.exist?(@task_report_file)
if File.exist?(@temp_report_file)
FileUtils.cp_r(@temp_report_file, @task_report_file)
return TRUE
else
return FALSE
end
end end
def deleteInitialIngestionFile() def deleteInitialIngestionFile()
...@@ -272,26 +301,34 @@ class DatasetUtils ...@@ -272,26 +301,34 @@ class DatasetUtils
return filename, extension, reference return filename, extension, reference
end end
def addToReport(reference, status, size, hash, data_set) def addToReport(reference, status, size, hash, data_set, new_reference=FALSE)
local_files = {} local_files = {}
begin begin
file_path = referenceToPath(reference, @data_set_directory, data_set) file_path = referenceToPath(reference, @data_set_directory, data_set)
modification_date = File.exist?(file_path) ? File.mtime(file_path).strftime("%Y-%m-%d-%H-%M-%S") : "not-modification-date" modification_date = File.exist?(file_path) ? File.mtime(file_path).strftime(DATE_FORMAT) : "not-modification-date"
if not reportFileExist() if not reportFileExist()
File.open(@task_report_file, 'w') {} File.open(@task_report_file, 'w') {}
end end
new_file = TRUE new_file = TRUE
File.readlines(@task_report_file).each do |line| File.readlines(@task_report_file).each do |line|
record = line.split(RECORD_SEPARATOR) record = line.split(RECORD_SEPARATOR)
if reference.to_s == record[0].to_s if reference.to_s == record[0].to_s
local_files[reference] = {"size" => size, "hash" => hash, "status" => status, "modification_date" => modification_date } if new_reference
reference = new_reference
file_path = referenceToPath(reference, @data_set_directory, data_set)
modification_date = File.exist?(file_path) ? File.mtime(file_path).strftime(DATE_FORMAT) : "not-modification-date"
end
local_files[reference] = {"size" => size, "hash" => hash, "status" => status,
"modification_date" => modification_date }
new_file = FALSE new_file = FALSE
else else
local_files[record[0]] = {"size" => record[2].chomp, "hash" => record[3].chomp, "status" => record[1].chomp, "modification_date" => record[4].chomp } local_files[record[0]] = {"size" => record[2].chomp, "hash" => record[3].chomp,
end "status" => record[1].chomp, "modification_date" => record[4].chomp }
end
end end
if new_file if new_file
local_files[reference] = {"size" => size, "hash" => hash, "status" => status, "modification_date" => modification_date } local_files[reference] = {"size" => size, "hash" => hash, "status" => status,
"modification_date" => modification_date }
end end
rescue Exception => e rescue Exception => e
@logger.error("An error occurred in DatasetUtils method 'addToReport':" + e.to_s) @logger.error("An error occurred in DatasetUtils method 'addToReport':" + e.to_s)
...@@ -306,13 +343,14 @@ class DatasetUtils ...@@ -306,13 +343,14 @@ class DatasetUtils
end end
def getHash(file) def getHash(file)
return "FILE-NOT-EXISTS" if ! File.exist?(file)
begin begin
chunk_size = 4 * MEGA chunk_size = 4 * MEGA
md5 = Digest::MD5.new md5 = Digest::MD5.new
open(file) do |f| open(file) do |f|
while chunk=f.read(chunk_size) while chunk=f.read(chunk_size)
md5.update(chunk) md5.update(chunk)
end end
end end
return md5.hexdigest return md5.hexdigest
rescue Exception => e rescue Exception => e
...@@ -338,7 +376,7 @@ class DatasetUtils ...@@ -338,7 +376,7 @@ class DatasetUtils
data_streams.each do |data_stream| data_streams.each do |data_stream|
file_path = referenceToPath(data_stream["reference"], @data_set_directory, dataset) file_path = referenceToPath(data_stream["reference"], @data_set_directory, dataset)
if files.include? file_path if files.include? file_path
conflicts.push(file_path.sub(@data_set_directory, "./")) conflicts.push(file_path.sub(@data_set_directory, "./"))
end end
end end
return conflicts return conflicts
...@@ -369,6 +407,21 @@ class DatasetUtils ...@@ -369,6 +407,21 @@ class DatasetUtils
end end
end end
def isRenamed(file, file_dict_list, file_dict=FALSE)
hash = file_dict ? file_dict["hash"] : ""
size = file_dict ? file_dict["size"] : (File.size(file).to_s if File.exist?(file))
file_dict_list.each do |path, dict|
if size == dict["size"].to_s
hash = hash != "" ? hash : getHash(file).to_s
if hash == dict["hash"]
old_path = path
return {"key" => old_path, "size" => size, "hash" => hash}
end
end
end
return FALSE
end
def isStaged(path, staged_dict, status) def isStaged(path, staged_dict, status)
return FALSE if staged_dict.nil? || staged_dict.empty? return FALSE if staged_dict.nil? || staged_dict.empty?
staged_status = {"index" => -1, "status" => ""} staged_status = {"index" => -1, "status" => ""}
...@@ -389,40 +442,53 @@ class DatasetUtils ...@@ -389,40 +442,53 @@ class DatasetUtils
return staged_status["status"] == status return staged_status["status"] == status
end end
def checkRenamed(path, deleted_files, change)
renamed_dict = isRenamed(path, deleted_files)
if renamed_dict
deleted_files.delete(renamed_dict["key"])
change["status"] = STATUS_RENAMED
change["new_path"] = change["path"]
change["path"] = renamed_dict["key"]
change["size"] = renamed_dict["size"]
change["hash"] = renamed_dict["hash"]
end
return change
end
def getLocalChanges(files, data_set, staged, partial_ingestion=FALSE) def getLocalChanges(files, data_set, staged, partial_ingestion=FALSE)
staged_changes, untracked_changes = [], [] staged_changes, untracked_changes = [], []
staged_dict = getStagedRecords() if staged staged_dict = getStagedRecords() if staged
deleted_files = {}
begin begin
if reportFileExist() if reportFileExist()
File.readlines(@task_report_file).each do |line| File.readlines(@task_report_file).each do |line|
record = line.split(RECORD_SEPARATOR) record = line.split(RECORD_SEPARATOR)
if record[1].chomp == RUN_DONE if record[1].chomp == RUN_DONE
file_path = referenceToPath(record[0], @data_set_directory, data_set) file_path = referenceToPath(record[0], @data_set_directory, data_set)
if files.include? file_path if files.include? file_path
modification_date = File.mtime(file_path).strftime("%Y-%m-%d-%H-%M-%S") modification_date = File.mtime(file_path).strftime(DATE_FORMAT)
if staged && isStaged(file_path, staged_dict, STAGE_REMOVE) if staged && isStaged(file_path, staged_dict, STAGE_REMOVE)
staged_changes.push({"path" => file_path, "size" => "", "hash" => DELETE, "status" => STATUS_DELETED }) staged_changes.push({"path" => file_path, "size" => "", "hash" => DELETE, "status" => STATUS_DELETED })
elsif modification_date != record[4].chomp elsif modification_date != record[4].chomp
size = File.size(file_path).to_s size = File.size(file_path).to_s
hash = getHash(file_path).to_s hash = getHash(file_path).to_s
change = {"path" => file_path, "size" => size, "hash" => hash, "status" => STATUS_MODIFIED } change = {"path" => file_path, "size" => size, "hash" => hash, "status" => STATUS_MODIFIED }
if size == record[2].to_s if size == record[2].to_s
if hash != record[3].chomp if hash != record[3].chomp
staged && isStaged(file_path, staged_dict, STAGE_ADD) ? staged_changes.push(change) : untracked_changes.push(change) staged && isStaged(file_path, staged_dict, STAGE_ADD) ? staged_changes.push(change) : untracked_changes.push(change)
end end
else else
staged && isStaged(file_path, staged_dict, STAGE_ADD) ? staged_changes.push(change) : untracked_changes.push(change) staged && isStaged(file_path, staged_dict, STAGE_ADD) ? staged_changes.push(change) : untracked_changes.push(change)
end end
end end
files.delete(file_path) files.delete(file_path)
else else
if not partial_ingestion if not partial_ingestion
change = {"path" => file_path, "size" => "", "hash" => DELETE, "status" => STATUS_DELETED } deleted_files[file_path] = {"size" => record[2].to_s, "hash" => record[3].chomp}
staged && isStaged(file_path, staged_dict, STAGE_REMOVE) ? staged_changes.push(change) : untracked_changes.push(change)
end end
end end
end end
end end
end end
untrucked_deletions = [] untrucked_deletions = []
files.each do |path| files.each do |path|
...@@ -433,6 +499,7 @@ class DatasetUtils ...@@ -433,6 +499,7 @@ class DatasetUtils
if File.exist?(path) if File.exist?(path)
# check scenario where new files were created inside a directory staged as deleted # check scenario where new files were created inside a directory staged as deleted
if File.mtime(path) > File.mtime(@staged_file) if File.mtime(path) > File.mtime(@staged_file)
change = checkRenamed(path, deleted_files, change)
untracked_changes.push(change) untracked_changes.push(change)
else else
File.delete(path) File.delete(path)
...@@ -441,13 +508,19 @@ class DatasetUtils ...@@ -441,13 +508,19 @@ class DatasetUtils
end end
untrucked_deletions.push(path) untrucked_deletions.push(path)
else else
change = checkRenamed(path, deleted_files, change)
isStaged(path, staged_dict, STAGE_ADD) ? staged_changes.push(change) : untracked_changes.push(change) isStaged(path, staged_dict, STAGE_ADD) ? staged_changes.push(change) : untracked_changes.push(change)
end end
else else
change = checkRenamed(path, deleted_files, change)
untracked_changes.push(change) untracked_changes.push(change)
end end
end end
updateStagedDeletions(untrucked_deletions) deleted_files.each do |deleted_path, dict|
change = {"path" => deleted_path, "size" => "", "hash" => DELETE, "status" => STATUS_DELETED }
staged && isStaged(deleted_path, staged_dict, STAGE_REMOVE) ? staged_changes.push(change) : untracked_changes.push(change)
end
updateStagedDeletions(untrucked_deletions) if staged
rescue Exception => e rescue Exception => e
@logger.error("An error occurred in DatasetUtils method 'getLocalChanges':" + e.to_s) @logger.error("An error occurred in DatasetUtils method 'getLocalChanges':" + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
...@@ -462,8 +535,54 @@ class DatasetUtils ...@@ -462,8 +535,54 @@ class DatasetUtils
return TRUE return TRUE
end end
def getRemoteChangedDataStreams(data_streams) def getRemoteFileListForDiscardLocalChanges(data_streams, data_set, check_changes=FALSE, changes=[])
pending_data_streams = [] remotes_for_discard_local = []
if changes.empty?
paths = [appendSlashTo(@data_set_directory)]
local_files = getLocalPaths(paths)
ignore, local_changes = getLocalChanges(local_files, data_set, staged=FALSE)
else
local_changes = changes
end
return local_changes if check_changes
local_changes.each do |change|
f, e, reference = getPathInfo(change["path"], data_set)
case change["status"]
when STATUS_NEW
@logger.info("Discarding local change on '#{change["path"]}'", print=TRUE)
@logger.info("New file removed.", print=TRUE)
puts
File.delete(change["path"]) if File.exist?(change["path"])
when STATUS_RENAMED
@logger.info("Discarding local change on '#{change["new_path"]}'", print=TRUE)
@logger.info("File moved/renamed back to '#{change["path"]}'.", print=TRUE)
puts
new_reference = reference
f, e, reference = getPathInfo(change["new_path"], data_set)
unless File.directory?(File.dirname(change["path"]))
FileUtils.mkdir_p(File.dirname(change["path"]))
end
FileUtils.mv(change["new_path"], change["path"]) if File.exist?(change["new_path"])
when STATUS_MODIFIED
data_stream = data_streams.find{|data_stream| data_stream["reference"] == reference }
if data_stream
data_stream["path"] = change["path"]
remotes_for_discard_local.push(data_stream)
end
when STATUS_DELETED
data_stream = data_streams.find{|data_stream| data_stream["reference"] == reference }
if data_stream
data_stream["path"] = change["path"]
remotes_for_discard_local.push(data_stream)
end
end
end
return remotes_for_discard_local
end
def getRemoteChangedDataStreams(data_streams, data_set)
changed_data_streams = []
new_changed_files = {}
begin begin
if reportFileExist() if reportFileExist()
local_files = {} local_files = {}
...@@ -489,19 +608,32 @@ class DatasetUtils ...@@ -489,19 +608,32 @@ class DatasetUtils
end end
if pending if pending
local_files.delete(reference) local_files.delete(reference)
pending_data_streams.push(data_stream) file_path = referenceToPath(data_stream["reference"], @data_set_directory, data_set)
new_changed_files[file_path] = data_stream
end end
end end
local_files.each do |key, array| local_files.each do |reference, file_dict|
if not remote_files.include? key if not remote_files.include? reference
pending_data_streams.push({"reference" => key, "hash" => DELETE }) changed_data_stream = {"reference" => reference, "hash" => DELETE }
file_path = referenceToPath(reference, @data_set_directory, data_set)
renamed_dict = isRenamed(file_path, new_changed_files, file_dict)
if renamed_dict
changed_data_stream = {"reference" => reference, "id" => new_changed_files[renamed_dict["key"]]["id"],
"new_reference" => new_changed_files[renamed_dict["key"]]["reference"], "status" => STATUS_RENAMED,
"size" => renamed_dict["size"], "hash" => renamed_dict["hash"] }
new_changed_files.delete(renamed_dict["key"])
end
changed_data_streams.push(changed_data_stream)
end end
end end
new_changed_files.each do |path, data_stream|
changed_data_streams.push(data_stream)
end
end end
rescue Exception => e rescue Exception => e
@logger.error("An error occurred in DatasetUtils method 'getRemoteChangedDataStreams':" + e.to_s) @logger.error("An error occurred in DatasetUtils method 'getRemoteChangedDataStreams':" + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
end end
return pending_data_streams return changed_data_streams
end end
end end
...@@ -19,7 +19,7 @@ module Embulk ...@@ -19,7 +19,7 @@ module Embulk
{"name"=>"eof", "type"=>"string"}, {"name"=>"eof", "type"=>"string"},
{"name"=>"size", "type"=>"string"}, {"name"=>"size", "type"=>"string"},
{"name"=>"hash", "type"=>"string"} {"name"=>"hash", "type"=>"string"}
] ]
def self.status(task, push=FALSE) def self.status(task, push=FALSE)
partial_ingestion = @dataset_utils.initialIngestionFileExist() partial_ingestion = @dataset_utils.initialIngestionFileExist()
...@@ -94,6 +94,7 @@ module Embulk ...@@ -94,6 +94,7 @@ module Embulk
@dataset_utils = DatasetUtils.new(@data_set_directory) @dataset_utils = DatasetUtils.new(@data_set_directory)
@status = config.param('status', :string, default: FALSE) @status = config.param('status', :string, default: FALSE)
@status = @status == "" ? FALSE : @status @status = @status == "" ? FALSE : @status
@dataset_utils.deleteDiscardChangesFile()
if @status if @status
if not @dataset_utils.initialIngestionFileExist() if not @dataset_utils.initialIngestionFileExist()
if not @dataset_utils.reportFileExist() if not @dataset_utils.reportFileExist()
...@@ -102,7 +103,7 @@ module Embulk ...@@ -102,7 +103,7 @@ module Embulk
@logger.abortExecution() @logger.abortExecution()
elsif not @dataset_utils.completedFileExist() elsif not @dataset_utils.completedFileExist()
puts puts
@logger.error("There is an interrumped download operation in dataset directory. Please resume the download first.", print=TRUE) @logger.error("There is an interrupted download operation in dataset directory. Please resume the download first.", print=TRUE)
@logger.abortExecution() @logger.abortExecution()
end end
end end
...@@ -120,7 +121,7 @@ module Embulk ...@@ -120,7 +121,7 @@ module Embulk
if data_stream_dict["status_code"] != 0 if data_stream_dict["status_code"] != 0
@logger.error(data_stream_dict["error_message"], print=TRUE) @logger.error(data_stream_dict["error_message"], print=TRUE)
@logger.abortExecution() @logger.abortExecution()
end end
task['data_streams'] = data_stream_dict["result"] task['data_streams'] = data_stream_dict["result"]
if not @dataset_utils.reportFileExist() if not @dataset_utils.reportFileExist()
...@@ -128,11 +129,11 @@ module Embulk ...@@ -128,11 +129,11 @@ module Embulk
else else
if not @dataset_utils.initialIngestionFileExist() if not @dataset_utils.initialIngestionFileExist()
@logger.info("Checking local dataset...", print=TRUE) @logger.info("Checking local dataset...", print=TRUE)
if not @dataset_utils.reportUpToDate(data_stream_dict) if not @dataset_utils.reportUpToDate(data_stream_dict, @data_set)
puts puts
@logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE) @logger.error("Your current dataset is outdated. Please, run a download to update it before ingest your changes.", print=TRUE)
puts puts
@logger.abortExecution(error=FALSE) @logger.abortExecution(error=FALSE)
end end
end end
end end
...@@ -145,12 +146,12 @@ module Embulk ...@@ -145,12 +146,12 @@ module Embulk
@logger.error("Could not find any valid file.", print=TRUE) @logger.error("Could not find any valid file.", print=TRUE)
@logger.error("Please make sure your dataset directory contains files for ingestion.", print=TRUE) @logger.error("Please make sure your dataset directory contains files for ingestion.", print=TRUE)
@logger.abortExecution() @logger.abortExecution()
end end
self.status(task, push=TRUE) self.status(task, push=TRUE)
@logger.info("Continue with ingestion? (y/n)", print=TRUE) @logger.info("Continue with ingestion? (y/n)", print=TRUE)
option = gets option = gets
option = option.chomp option = option.chomp
if option == "n" if option == "n"
@logger.info("Ingestion cancelled by user.", print=TRUE) @logger.info("Ingestion cancelled by user.", print=TRUE)
@logger.abortExecution() @logger.abortExecution()
...@@ -160,20 +161,20 @@ module Embulk ...@@ -160,20 +161,20 @@ module Embulk
end end
columns = [ columns = [
Column.new(0, "supplier", :string), Column.new(0, "supplier", :string),
Column.new(1, "data_set", :string), Column.new(1, "data_set", :string),
Column.new(2, "file", :string), Column.new(2, "file", :string),
Column.new(3, "extension", :string), Column.new(3, "extension", :string),
Column.new(4, "data_chunk", :string), Column.new(4, "data_chunk", :string),
Column.new(5, "eof", :string), Column.new(5, "eof", :string),
Column.new(6, "size", :string), Column.new(6, "size", :string),
Column.new(7, "hash", :string) Column.new(7, "hash", :string)
] ]
commit_reports = yield(task, columns, task['paths'].length) commit_reports = yield(task, columns, task['paths'].length)
done = commit_reports.map{|hash| hash["done"]}.flatten.compact done = commit_reports.map{|hash| hash["done"]}.flatten.compact
resume(task, columns, task['paths'].length, &control) resume(task, columns, task['paths'].length, &control)
rescue Exception => e rescue Exception => e
@logger.error("An error occurred during operation: " + e.to_s, print=TRUE) @logger.error("An error occurred during operation: " + e.to_s, print=TRUE)
@logger.error(e.backtrace) @logger.error(e.backtrace)
...@@ -184,9 +185,9 @@ module Embulk ...@@ -184,9 +185,9 @@ module Embulk
def self.resume(task, columns, count, &control) def self.resume(task, columns, count, &control)
@logger = LogManager.instance() @logger = LogManager.instance()
task_reports = yield(task, columns, count) task_reports = yield(task, columns, count)
@dataset_utils.showTaskReport(task_reports) next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact
next_config_diff = task_reports.map{|hash| hash["done"]}.flatten.compact @dataset_utils.showTaskReport(next_config_diff)
element_output = @dataset_utils.initialIngestionFileExist() ? "new file" : "change" element_output = @dataset_utils.initialIngestionFileExist() ? "new file" : "change"
@logger.info("#{next_config_diff.length} #{element_output}(s) ingested.", print=TRUE) @logger.info("#{next_config_diff.length} #{element_output}(s) ingested.", print=TRUE)
if(next_config_diff.length == count) if(next_config_diff.length == count)
...@@ -194,7 +195,7 @@ module Embulk ...@@ -194,7 +195,7 @@ module Embulk
@wendelin.increaseDatasetVersion(@data_set) @wendelin.increaseDatasetVersion(@data_set)
@dataset_utils.deleteStagedFile() @dataset_utils.deleteStagedFile()
else else
failed_tasks = task_reports.map{|hash| hash["error"]}.flatten.compact failed_tasks = task_reports.map{|hash| hash[DatasetUtils::RUN_ERROR] || hash[DatasetUtils::RUN_ABORTED] }.flatten.compact
@dataset_utils.showTaskErrors(failed_tasks) @dataset_utils.showTaskErrors(failed_tasks)
end end
next_config_diff = {} next_config_diff = {}
...@@ -202,9 +203,9 @@ module Embulk ...@@ -202,9 +203,9 @@ module Embulk
end end
def initialize(task, schema, index, page_builder) def initialize(task, schema, index, page_builder)
super super
@supplier = task['supplier'] @supplier = task['supplier']
@dataset = task['data_set'] @dataset = task['data_set']
@chunk_size = task['chunk_size'] @chunk_size = task['chunk_size']
@data_set_directory = task['data_set_directory'] @data_set_directory = task['data_set_directory']
@logger = LogManager.instance() @logger = LogManager.instance()
...@@ -219,16 +220,19 @@ module Embulk ...@@ -219,16 +220,19 @@ module Embulk
size = file_dict["size"] size = file_dict["size"]
hash = file_dict["hash"] hash = file_dict["hash"]
delete = hash == DatasetUtils::DELETE delete = hash == DatasetUtils::DELETE
rename = file_dict["status"] == DatasetUtils::STATUS_RENAMED
if size == "" and hash == "" #new file if size == "" and hash == "" #new file
size = File.size(path) size = File.size(path)
hash = @dataset_utils.getHash(path) hash = @dataset_utils.getHash(path)
end end
new_filename, new_extension, new_reference = @dataset_utils.getPathInfo(file_dict["new_path"], @dataset) if rename
filename, extension, reference = @dataset_utils.getPathInfo(path, @dataset) filename, extension, reference = @dataset_utils.getPathInfo(path, @dataset)
@dataset_utils.saveCurrentOperation(DatasetUtils::INGESTION, reference) operation = rename ? DatasetUtils::RENAME : DatasetUtils::INGESTION
each_chunk(path, filename, extension, size, hash, schema[1..-1].map{|elm| elm.name}, @chunk_size, delete) do |entry| @dataset_utils.saveCurrentOperation(operation, reference, new_reference)
each_chunk(path, filename, extension, size, hash, schema[1..-1].map{|elm| elm.name}, @chunk_size, delete, new_reference) do |entry|
@page_builder.add(entry) @page_builder.add(entry)
end end
@page_builder.finish @page_builder.finish
rescue java.lang.OutOfMemoryError rescue java.lang.OutOfMemoryError
@logger.logOutOfMemoryError(path) @logger.logOutOfMemoryError(path)
return_value = DatasetUtils::RUN_ABORTED return_value = DatasetUtils::RUN_ABORTED
...@@ -247,7 +251,7 @@ module Embulk ...@@ -247,7 +251,7 @@ module Embulk
end end
else else
if @dataset_utils.reportFileExist() if @dataset_utils.reportFileExist()
@dataset_utils.addToReport(reference, return_value, size, hash, task['data_set']) @dataset_utils.addToReport(reference, return_value, size, hash, task['data_set'], new_reference)
end end
end end
end end
...@@ -257,29 +261,33 @@ module Embulk ...@@ -257,29 +261,33 @@ module Embulk
private private
def each_chunk(path, filename, extension, size, hash, fields, chunk_size=DatasetUtils::CHUNK_SIZE, delete=FALSE) def each_chunk(path, filename, extension, size, hash, fields, chunk_size=DatasetUtils::CHUNK_SIZE, delete=FALSE, new_reference=FALSE)
if delete if delete
File.delete(path) if File.exist?(path) File.delete(path) if File.exist?(path)
values = [@supplier, @dataset, filename, extension, "", DatasetUtils::DELETE, "", ""] values = [@supplier, @dataset, filename, extension, "", DatasetUtils::DELETE, "", ""]
yield(values) yield(values)
elsif new_reference
File.delete(path) if File.exist?(path)
values = [@supplier, @dataset, filename, extension, new_reference, DatasetUtils::RENAME, "", ""]
yield(values)
else else
file_object = File.open(path, "rb") file_object = File.open(path, "rb")
npart = 0 npart = 0
next_byte = file_object.read(1) next_byte = file_object.read(1)
first = TRUE first = TRUE
while true while true
data = next_byte data = next_byte
if not next_byte if not next_byte
if first # this means this is an empty file if first # this means this is an empty file
values = [@supplier, @dataset, filename, extension, "", "", size, hash] values = [@supplier, @dataset, filename, extension, "", "", size, hash]
yield(values) yield(values)
end end
break break
end end
data += file_object.read(chunk_size) data += file_object.read(chunk_size)
next_byte = file_object.read(1) next_byte = file_object.read(1)
if not next_byte if not next_byte
eof = DatasetUtils::EOF eof = DatasetUtils::EOF
if first # this means that the whole file will be ingested at once (not split) if first # this means that the whole file will be ingested at once (not split)
eof = "" eof = ""
end end
......
...@@ -38,43 +38,43 @@ module Embulk ...@@ -38,43 +38,43 @@ module Embulk
end end
def self.askUserForAction(task, action) def self.askUserForAction(task, action)
if action == RESUME if action == RESUME
action_message = "#{RESUME}: Resume. Continues download from last file." action_message = "#{RESUME}: Resume. Continues download from last file."
else else
action = UPDATE action = UPDATE
action_message = "#{UPDATE}: Update. Checks for changes in dataset." action_message = "#{UPDATE}: Update. Checks for changes in dataset."
end end
valid_option = FALSE valid_option = FALSE
while not valid_option while not valid_option
@logger.info("Please select an option [#{action}, #{DOWNLOAD}, #{ABORT}]", print=TRUE) @logger.info("Please select an option [#{action}, #{DOWNLOAD}, #{ABORT}]", print=TRUE)
@logger.info(action_message, print=TRUE) @logger.info(action_message, print=TRUE)
@logger.info("#{DOWNLOAD}: Download. Downloads the dataset from scratch.", print=TRUE) @logger.info("#{DOWNLOAD}: Download. Downloads the dataset from scratch.", print=TRUE)
@logger.info("#{ABORT}: Abort operation.", print=TRUE) @logger.info("#{ABORT}: Abort operation.", print=TRUE)
option = gets option = gets
option = option.chomp option = option.chomp
if not [action, DOWNLOAD, ABORT].include? option if not [action, DOWNLOAD, ABORT].include? option
@logger.info("Invalid option", print=TRUE) @logger.info("Invalid option", print=TRUE)
else else
valid_option = TRUE valid_option = TRUE
end end
end end
case option case option
when action when action
@logger.info("Checking remote changes and posible local conflicts...", print=TRUE) if action != RESUME @logger.info("Checking remote changes and posible local conflicts...", print=TRUE) if action != RESUME
task['data_streams'] = @dataset_utils.getRemoteChangedDataStreams(task['data_streams']) task['data_streams'] = @dataset_utils.getRemoteChangedDataStreams(task['data_streams'], @data_set)
self.warnConflicts(task['data_streams'], task['data_set']) if action != RESUME self.warnConflicts(task['data_streams'], task['data_set']) if action != RESUME
@dataset_utils.deleteCompletedFile() @dataset_utils.deleteCompletedFile()
if task['data_streams'].empty? if task['data_streams'].empty?
@logger.info("Your downloaded dataset is already up to date.", print=TRUE) @logger.info("Your downloaded dataset is already up to date.", print=TRUE)
end end
when DOWNLOAD when DOWNLOAD
@logger.info("Checking remote files and posible local conflicts...", print=TRUE) @logger.info("Checking remote files and posible local conflicts...", print=TRUE)
self.warnConflicts(task['data_streams'], task['data_set']) self.warnConflicts(task['data_streams'], task['data_set'])
@dataset_utils.deleteCompletedFile() @dataset_utils.deleteCompletedFile()
@dataset_utils.createReportFile() @dataset_utils.createReportFile()
when ABORT when ABORT
@logger.abortExecution() @logger.abortExecution()
end end
end end
def self.transaction(config, &control) def self.transaction(config, &control)
...@@ -95,7 +95,6 @@ module Embulk ...@@ -95,7 +95,6 @@ module Embulk
@logger.abortExecution() @logger.abortExecution()
end end
@wendelin = WendelinClient.new(@erp5_url, @user, @password) @wendelin = WendelinClient.new(@erp5_url, @user, @password)
task = { task = {
'erp5_url' => @erp5_url, 'erp5_url' => @erp5_url,
'data_set' => @data_set, 'data_set' => @data_set,
...@@ -113,6 +112,17 @@ module Embulk ...@@ -113,6 +112,17 @@ module Embulk
task['data_set_directory'] = @dataset_utils.appendSlashTo(@output_path) task['data_set_directory'] = @dataset_utils.appendSlashTo(@output_path)
@data_set_directory = task['data_set_directory'] @data_set_directory = task['data_set_directory']
@dataset_utils = DatasetUtils.new(@data_set_directory) @dataset_utils = DatasetUtils.new(@data_set_directory)
if @dataset_utils.reportFileExist() && @dataset_utils.completedFileExist() && @dataset_utils.discardChangesFileExist() && ! @dataset_utils.initialIngestionFileExist()
task['discard_changes'] = @dataset_utils.discardChangesFileExist()
local_changes = @dataset_utils.getRemoteFileListForDiscardLocalChanges([], @data_set, check_changes=TRUE)
if local_changes.empty?
puts
@logger.info("No local changes to discard.", print=TRUE)
puts
@dataset_utils.deleteDiscardChangesFile()
@logger.abortExecution(error=FALSE)
end
end
@logger.info("Getting remote file list from dataset '#{@data_set}'...", print=TRUE) @logger.info("Getting remote file list from dataset '#{@data_set}'...", print=TRUE)
data_stream_list = @wendelin.getDataStreams(@data_set) data_stream_list = @wendelin.getDataStreams(@data_set)
if data_stream_list["status_code"] == 0 if data_stream_list["status_code"] == 0
...@@ -129,10 +139,26 @@ module Embulk ...@@ -129,10 +139,26 @@ module Embulk
if @dataset_utils.reportFileExist() if @dataset_utils.reportFileExist()
if @dataset_utils.completedFileExist() if @dataset_utils.completedFileExist()
puts if task['discard_changes']
@logger.info("This dataset was already downloaded. What do you want to do?", print=TRUE) puts
puts @logger.warn("All your local changes will be discarded.", print=TRUE)
self.askUserForAction(task, action=UPDATE) @logger.warn("Do you want to continue? (y/n)", print=TRUE)
option = gets
option = option.chomp
if option == "n" or option == "N"
@logger.info("Download cancelled by user.", print=TRUE)
@dataset_utils.deleteDiscardChangesFile()
@logger.abortExecution(error=FALSE)
end
@dataset_utils.deleteStagedFile()
task['data_streams'] = @dataset_utils.getRemoteFileListForDiscardLocalChanges(task['data_streams'], @data_set,
check_changes=FALSE, changes=local_changes)
else
puts
@logger.info("This dataset was already downloaded. What do you want to do?", print=TRUE)
puts
self.askUserForAction(task, action=UPDATE)
end
elsif not @dataset_utils.initialIngestionFileExist() elsif not @dataset_utils.initialIngestionFileExist()
puts puts
@logger.info("There was a previous attempt to download this dataset but it did not finish successfully.", print=TRUE) @logger.info("There was a previous attempt to download this dataset but it did not finish successfully.", print=TRUE)
...@@ -140,10 +166,20 @@ module Embulk ...@@ -140,10 +166,20 @@ module Embulk
puts puts
self.askUserForAction(task, action=RESUME) self.askUserForAction(task, action=RESUME)
else else
if @dataset_utils.discardChangesFileExist()
puts
@logger.info("Discard changes feature do not apply in current dataset directory status.", print=TRUE)
@logger.info("Continuing with dataset download.", print=TRUE)
end
puts puts
self.askUserForAction(task, action=UPDATE) self.askUserForAction(task, action=UPDATE)
end end
else else
if @dataset_utils.discardChangesFileExist()
puts
@logger.info("Discard changes feature do not apply in current dataset directory status.", print=TRUE)
@logger.info("Continuing with dataset download.", print=TRUE)
end
if not @dataset_utils.dirEmpty(@data_set_directory) if not @dataset_utils.dirEmpty(@data_set_directory)
puts puts
@logger.info("Dataset download directory is not empty! Its files could be overwritten: " + @data_set_directory, print=TRUE) @logger.info("Dataset download directory is not empty! Its files could be overwritten: " + @data_set_directory, print=TRUE)
...@@ -151,8 +187,8 @@ module Embulk ...@@ -151,8 +187,8 @@ module Embulk
option = gets option = gets
option = option.chomp option = option.chomp
if option == "n" if option == "n"
@logger.info("Download cancelled by user.", print=TRUE) @logger.info("Download cancelled by user.", print=TRUE)
@logger.abortExecution(error=FALSE) @logger.abortExecution(error=FALSE)
end end
@logger.info("Checking remote files and posible local conflicts...", print=TRUE) @logger.info("Checking remote files and posible local conflicts...", print=TRUE)
self.warnConflicts(task['data_streams'], task['data_set']) self.warnConflicts(task['data_streams'], task['data_set'])
...@@ -160,11 +196,13 @@ module Embulk ...@@ -160,11 +196,13 @@ module Embulk
@dataset_utils.createReportFile() @dataset_utils.createReportFile()
end end
@dataset_utils.deleteInitialIngestionFile() @dataset_utils.deleteInitialIngestionFile()
@dataset_utils.deleteDiscardChangesFile()
columns = [ columns = [
Column.new(0, "reference", :string), Column.new(0, "reference", :string),
Column.new(1, "data_chunk", :string), Column.new(1, "data_chunk", :string),
Column.new(2, "data_set", :string), Column.new(2, "data_set", :string),
Column.new(3, "mode", :string) Column.new(3, "mode", :string),
Column.new(4, "rename", :string)
] ]
resume(task, columns, task['data_streams'].length, &control) resume(task, columns, task['data_streams'].length, &control)
rescue Exception => e rescue Exception => e
...@@ -177,14 +215,18 @@ module Embulk ...@@ -177,14 +215,18 @@ module Embulk
def self.resume(task, columns, count, &control) def self.resume(task, columns, count, &control)
@logger = LogManager.instance() @logger = LogManager.instance()
task_reports = yield(task, columns, count) task_reports = yield(task, columns, count)
@dataset_utils.showTaskReport(task_reports) @dataset_utils.showTaskReport(task_reports) if not task['discard_changes']
next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact next_config_diff = task_reports.map{|hash| hash[DatasetUtils::RUN_DONE]}.flatten.compact
if(next_config_diff.length == count) if(next_config_diff.length == count)
if(count > 0) if(count > 0)
@logger.info("Dataset successfully downloaded.", print=TRUE) if task['discard_changes']
@logger.info("#{count} files processed.", print=TRUE) @logger.info("All local changes were discarded.", print=TRUE)
@logger.info("Dataset files are in dataset directory: " + @data_set_directory, print=TRUE) else
@logger.info("Dataset successfully downloaded.", print=TRUE)
@logger.info("#{count} files processed.", print=TRUE)
@logger.info("Dataset files are in dataset directory: " + @data_set_directory, print=TRUE)
end
end end
@dataset_utils.createCompletedFile() @dataset_utils.createCompletedFile()
else else
...@@ -200,8 +242,8 @@ module Embulk ...@@ -200,8 +242,8 @@ module Embulk
end end
def initialize(task, schema, index, page_builder) def initialize(task, schema, index, page_builder)
super super
@data_set = task['data_set'] @data_set = task['data_set']
@chunk_size = task['chunk_size'] @chunk_size = task['chunk_size']
@data_set_directory = task['data_set_directory'] @data_set_directory = task['data_set_directory']
@wendelin = WendelinClient.new(task['erp5_url'], task['user'], task['password']) @wendelin = WendelinClient.new(task['erp5_url'], task['user'], task['password'])
...@@ -215,27 +257,33 @@ module Embulk ...@@ -215,27 +257,33 @@ module Embulk
ref = data_stream["reference"] ref = data_stream["reference"]
size = data_stream["size"] size = data_stream["size"]
hash = data_stream["hash"] hash = data_stream["hash"]
renamed = data_stream["status"] == DatasetUtils::STATUS_RENAMED
deleted = hash.to_s == DatasetUtils::DELETE
begin begin
if hash.to_s == DatasetUtils::DELETE if deleted
@logger.info("Deleting #{ref}", print=TRUE) entry = [ref, "", @data_set, DatasetUtils::DELETE, renamed]
entry = [ref, "", @data_set, hash.to_s] page_builder.add(entry)
elsif renamed
new_reference = data_stream["new_reference"]
entry = [ref, new_reference, @data_set, TRUE, renamed]
page_builder.add(entry) page_builder.add(entry)
else else
@logger.info("Discarding local change on '#{data_stream["path"]}'", print=TRUE) if task['discard_changes']
@logger.info("Getting content from remote #{ref}", print=TRUE) @logger.info("Getting content from remote #{ref}", print=TRUE)
n_chunk = 0 n_chunk = 0
@wendelin.eachDataStreamContentChunk(id, @chunk_size) do |chunk| @wendelin.eachDataStreamContentChunk(id, @chunk_size) do |chunk|
content = chunk.nil? || chunk.empty? ? "" : Base64.encode64(chunk) content = chunk.nil? || chunk.empty? ? "" : Base64.encode64(chunk)
begin_of_file = n_chunk == 0 begin_of_file = n_chunk == 0
entry = [ref, content, @data_set, begin_of_file] entry = [ref, content, @data_set, begin_of_file, renamed]
page_builder.add(entry) page_builder.add(entry)
n_chunk += 1 n_chunk += 1
end end
end end
page_builder.finish page_builder.finish
rescue java.lang.OutOfMemoryError rescue java.lang.OutOfMemoryError
@logger.logOutOfMemoryError(ref) @logger.logOutOfMemoryError(ref)
return_value = DatasetUtils::RUN_ABORTED return_value = DatasetUtils::RUN_ABORTED
rescue Exception => e rescue Exception => e
@logger.error(e.to_s, print=TRUE) @logger.error(e.to_s, print=TRUE)
@logger.error(e.backtrace) @logger.error(e.backtrace)
puts "[INFO] For more detailed information, please refer to the log file: " + @logger.getLogPath() puts "[INFO] For more detailed information, please refer to the log file: " + @logger.getLogPath()
...@@ -244,13 +292,13 @@ module Embulk ...@@ -244,13 +292,13 @@ module Embulk
return_value = DatasetUtils::RUN_DONE return_value = DatasetUtils::RUN_DONE
end end
if return_value == DatasetUtils::RUN_DONE if return_value == DatasetUtils::RUN_DONE
if hash.to_s == DatasetUtils::DELETE if deleted
@dataset_utils.deleteFromReport(ref, return_value) @dataset_utils.deleteFromReport(ref, return_value)
else else
@dataset_utils.addToReport(ref, return_value, size, hash, task['data_set']) @dataset_utils.addToReport(ref, return_value, size, hash, task['data_set'], new_reference)
end end
end end
return {return_value => ref} return {return_value => ref}
end end
end end
end end
......
...@@ -11,15 +11,15 @@ module Embulk ...@@ -11,15 +11,15 @@ module Embulk
def self.transaction(config, schema, count, &control) def self.transaction(config, schema, count, &control)
@logger = LogManager.instance() @logger = LogManager.instance()
task = { "output_path" => config.param("output_path", :string, :default => nil) } task = { "output_path" => config.param("output_path", :string, :default => nil) }
if File.directory?(task['output_path']) if File.directory?(task['output_path'])
else else
@logger.error("Output directory not found.", print=TRUE) @logger.error("Output directory not found.", print=TRUE)
@logger.abortExecution() @logger.abortExecution()
end end
task_reports = yield(task) task_reports = yield(task)
next_config_diff = {} next_config_diff = {}
return next_config_diff return next_config_diff
end end
def init def init
...@@ -32,29 +32,37 @@ module Embulk ...@@ -32,29 +32,37 @@ module Embulk
def add(page) def add(page)
begin begin
page.each do |record| page.each do |record|
reference = record[0] reference = record[0]
data_chunk = Base64.decode64(record[1])
@dataset_utils = DatasetUtils.new("") @dataset_utils = DatasetUtils.new("")
data_set_directory = @dataset_utils.appendSlashTo(@output_path) data_set_directory = @dataset_utils.appendSlashTo(@output_path)
file_path = @dataset_utils.referenceToPath(reference, data_set_directory, record[2]) file_path = @dataset_utils.referenceToPath(reference, data_set_directory, record[2])
write_mode = 'ab' write_mode = 'ab'
if record[3] == DatasetUtils::DELETE if record[3] == DatasetUtils::DELETE
@logger.info("Deleting '#{file_path}'", print=TRUE)
File.delete(file_path) if File.exist?(file_path) File.delete(file_path) if File.exist?(file_path)
elsif record[4] == TRUE.to_s # if renamed
new_file_path = @dataset_utils.referenceToPath(record[1], data_set_directory, record[2])
@logger.info("Renaming '#{file_path}' to '#{new_file_path}'", print=TRUE)
unless File.directory?(File.dirname(new_file_path))
FileUtils.mkdir_p(File.dirname(new_file_path))
end
FileUtils.mv(file_path, new_file_path) if File.exist?(file_path)
else else
data_chunk = Base64.decode64(record[1])
if record[3] == TRUE.to_s if record[3] == TRUE.to_s
write_mode = 'w' write_mode = 'w'
end end
dirname = File.dirname(file_path) dirname = File.dirname(file_path)
unless File.directory?(dirname) unless File.directory?(dirname)
FileUtils.mkdir_p(dirname) FileUtils.mkdir_p(dirname)
end end
File.open(file_path, write_mode) { |file| file.write(data_chunk) } File.open(file_path, write_mode) { |file| file.write(data_chunk) }
end end
end end
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while procesing file.", print=TRUE) @logger.error("An error occurred while procesing file.", print=TRUE)
@logger.error(e.backtrace) @logger.error(e.backtrace)
raise e raise e
end end
end end
...@@ -66,8 +74,8 @@ module Embulk ...@@ -66,8 +74,8 @@ module Embulk
end end
def commit def commit
task_report = {} task_report = {}
return task_report return task_report
end end
end end
......
...@@ -9,33 +9,33 @@ module Embulk ...@@ -9,33 +9,33 @@ module Embulk
Plugin.register_output("wendelin", self) Plugin.register_output("wendelin", self)
def self.transaction(config, schema, count, &control) def self.transaction(config, schema, count, &control)
task = { task = {
"erp5_url" => config.param("erp5_url", :string), "erp5_url" => config.param("erp5_url", :string),
"user" => config.param("user", :string, defualt: nil), "user" => config.param("user", :string, defualt: nil),
"password" => config.param("password", :string, default: nil), "password" => config.param("password", :string, default: nil),
"path_prefix" => config.param("path_prefix", :string, :default => nil), "path_prefix" => config.param("path_prefix", :string, :default => nil),
} }
task_reports = yield(task) task_reports = yield(task)
next_config_diff = {} next_config_diff = {}
@logger = LogManager.instance() @logger = LogManager.instance()
@logger.info("Your ingested files will be available in the site in a few minutes. Thank for your patience.", print=TRUE) @logger.info("Your ingested files will be available in the site in a few minutes. Thank for your patience.", print=TRUE)
return next_config_diff return next_config_diff
end end
def init def init
credentials = {} credentials = {}
@erp5_url = task["erp5_url"] @erp5_url = task["erp5_url"]
@user = task["user"] @user = task["user"]
@password = task["password"] @password = task["password"]
@logger = LogManager.instance() @logger = LogManager.instance()
@wendelin = WendelinClient.new(@erp5_url, @user, @password) @wendelin = WendelinClient.new(@erp5_url, @user, @password)
end end
def close def close
end end
def add(page) def add(page)
page.each do |record| page.each do |record|
supplier = (record[0].nil? || record[0].empty?) ? "default" : record[0] supplier = (record[0].nil? || record[0].empty?) ? "default" : record[0]
dataset = (record[1].nil? || record[1].empty?) ? "default" : record[1] dataset = (record[1].nil? || record[1].empty?) ? "default" : record[1]
filename = record[2] filename = record[2]
...@@ -48,18 +48,21 @@ module Embulk ...@@ -48,18 +48,21 @@ module Embulk
if eof == DatasetUtils::DELETE if eof == DatasetUtils::DELETE
reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR) reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
@wendelin.delete(reference) @wendelin.delete(reference)
elsif eof == DatasetUtils::RENAME
reference = [dataset, filename, extension].join(DatasetUtils::REFERENCE_SEPARATOR)
@wendelin.rename(reference, record[4].to_s)
else else
reference = [supplier, dataset, filename, extension, eof, size, hash].join(DatasetUtils::REFERENCE_SEPARATOR) reference = [supplier, dataset, filename, extension, eof, size, hash].join(DatasetUtils::REFERENCE_SEPARATOR)
split = eof != "" split = eof != ""
if not @wendelin.ingest(reference, data_chunk, split) if not @wendelin.ingest(reference, data_chunk, split)
raise "could not ingest" raise "could not ingest"
end end
end end
rescue Exception => e rescue Exception => e
@logger.error(e.backtrace) @logger.error(e.backtrace)
raise e raise e
end end
end end
end end
def finish def finish
...@@ -69,8 +72,8 @@ module Embulk ...@@ -69,8 +72,8 @@ module Embulk
end end
def commit def commit
task_report = {} task_report = {}
return task_report return task_report
end end
end end
......
...@@ -24,42 +24,42 @@ module Embulk ...@@ -24,42 +24,42 @@ module Embulk
tool_dir = config.param('tool_dir', :string, default: ".") tool_dir = config.param('tool_dir', :string, default: ".")
@logger = LogManager.instance() @logger = LogManager.instance()
@logger.setFilename(tool_dir, "parser") @logger.setFilename(tool_dir, "parser")
task = { task = {
chunk_size: config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA, chunk_size: config.param('chunk_size', :float, default: 0) * DatasetUtils::MEGA,
supplier: config.param("supplier", :string, default: "parser"), supplier: config.param("supplier", :string, default: "parser"),
data_set: config.param("data_set", :string), data_set: config.param("data_set", :string),
input_plugin: config.param("storage", :string, default: "parser"), input_plugin: config.param("storage", :string, default: "parser"),
date: Time.now.strftime("%Y-%m-%d_%H-%M-%S") date: Time.now.strftime("%Y-%m-%d_%H-%M-%S")
} }
if task['chunk_size'] == 0 if task['chunk_size'] == 0
task['chunk_size'] = DatasetUtils::CHUNK_SIZE task['chunk_size'] = DatasetUtils::CHUNK_SIZE
end end
columns = [ columns = [
Column.new(0, "supplier", :string), Column.new(0, "supplier", :string),
Column.new(1, "data_set", :string), Column.new(1, "data_set", :string),
Column.new(2, "file", :string), Column.new(2, "file", :string),
Column.new(3, "extension", :string), Column.new(3, "extension", :string),
Column.new(4, "data_chunk", :string), Column.new(4, "data_chunk", :string),
Column.new(5, "eof", :string), Column.new(5, "eof", :string),
Column.new(6, "size", :string), Column.new(6, "size", :string),
Column.new(7, "hash", :string) Column.new(7, "hash", :string)
] ]
yield(task, columns) yield(task, columns)
end end
def run(file_input) def run(file_input)
@index = Index.instance().get() @index = Index.instance().get()
@logger = LogManager.instance() @logger = LogManager.instance()
while file = file_input.next_file while file = file_input.next_file
begin begin
filename = "file_from_#{task['input_plugin']}_#{task['date']}" filename = "file_from_#{task['input_plugin']}_#{task['date']}"
each_chunk(file, filename, task['chunk_size']) do |record| each_chunk(file, filename, task['chunk_size']) do |record|
@page_builder.add(record) @page_builder.add(record)
end end
@page_builder.finish @page_builder.finish
Index.instance().increase() Index.instance().increase()
rescue java.lang.OutOfMemoryError rescue java.lang.OutOfMemoryError
@logger.logOutOfMemoryError(path) @logger.logOutOfMemoryError(path)
return return
rescue Exception => e rescue Exception => e
...@@ -67,18 +67,18 @@ module Embulk ...@@ -67,18 +67,18 @@ module Embulk
@logger.error(e.backtrace) @logger.error(e.backtrace)
puts "[INFO] For more detailed information, please refer to the log file: " + @logger.getLogPath() puts "[INFO] For more detailed information, please refer to the log file: " + @logger.getLogPath()
end end
end end
end end
private private
def each_chunk(file, filename, chunk_size=DatasetUtils::CHUNK_SIZE) def each_chunk(file, filename, chunk_size=DatasetUtils::CHUNK_SIZE)
extension = @index.to_s.rjust(3, "0") extension = @index.to_s.rjust(3, "0")
npart = 0 npart = 0
next_byte = file.read(1) next_byte = file.read(1)
first = TRUE first = TRUE
while true while true
data = next_byte data = next_byte
if not next_byte if not next_byte
if first if first
# this means this is an empty file # this means this is an empty file
values = [task['supplier'], task['data_set'], filename, extension, "", "", "", ""] values = [task['supplier'], task['data_set'], filename, extension, "", "", "", ""]
...@@ -86,10 +86,10 @@ module Embulk ...@@ -86,10 +86,10 @@ module Embulk
end end
break break
end end
data += file.read(chunk_size) data += file.read(chunk_size)
next_byte = file.read(1) next_byte = file.read(1)
if not next_byte if not next_byte
eof = DatasetUtils::EOF eof = DatasetUtils::EOF
if first if first
# this means that the whole file will be ingested at once (not split) # this means that the whole file will be ingested at once (not split)
eof = "" eof = ""
......
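The parser above splits each input file using a one-byte read-ahead so it can tell, before emitting a chunk, whether that chunk is the last one. A minimal, self-contained sketch of that idea follows; the EOF marker value, the part numbering, and the 50 MB default are assumptions standing in for the plugin's DatasetUtils constants, not the real ones.

  require "stringio"

  CHUNK_SIZE = 50 * 1024 * 1024   # assumed default; the real value comes from DatasetUtils::CHUNK_SIZE
  EOF_MARKER = "EOF"              # assumed stand-in for DatasetUtils::EOF

  # Yields [chunk, eof_tag] pairs. A file that fits in a single chunk gets an empty tag,
  # the last chunk of a split file gets EOF_MARKER, intermediate chunks get a part number.
  def each_chunk(io, chunk_size = CHUNK_SIZE)
    next_byte = io.read(1)                     # read-ahead byte: nil here means the file is empty
    first = true
    npart = 0
    while true
      data = next_byte
      break if data.nil?                       # no more data to emit
      data += io.read(chunk_size) || ""        # fill the rest of the chunk
      next_byte = io.read(1)                   # look ahead to detect the real end of the file
      if next_byte.nil?
        eof = first ? "" : EOF_MARKER          # whole file in one piece vs. last piece of a split file
      else
        npart += 1
        eof = npart.to_s                       # assumption: intermediate chunks are numbered
      end
      yield data, eof
      first = false
    end
  end

  # usage: split an in-memory 40-byte file into 16-byte chunks
  each_chunk(StringIO.new("a" * 40), 16) { |chunk, eof| puts "#{chunk.bytesize} bytes, eof='#{eof}'" }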
...@@ -16,8 +16,15 @@ class WendelinClient ...@@ -16,8 +16,15 @@ class WendelinClient
@last_ingestion = Time.new - 2 @last_ingestion = Time.new - 2
end end
def checkReferenceChars(reference)
if ["&", ";", "#", "%", '"', "+"].any? { |char| reference.include?(char) }
raise "invalid char in filename. Following chars are not allowed for filenames: \& \; \% \" \+ \# Please rename it."
end
end
def exists(reference) def exists(reference)
uri = URI("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}") checkReferenceChars(reference)
uri = URI(URI.escape("#{@erp5_url}/ingestionReferenceExists?reference=#{reference}"))
begin begin
res = open(uri, http_basic_authentication: [@user, @password]).read res = open(uri, http_basic_authentication: [@user, @password]).read
rescue Exception => e rescue Exception => e
...@@ -34,11 +41,25 @@ class WendelinClient ...@@ -34,11 +41,25 @@ class WendelinClient
def delete(reference) def delete(reference)
@logger.info("Deletion requested for reference #{reference}", print=TRUE) @logger.info("Deletion requested for reference #{reference}", print=TRUE)
uri = URI("#{@erp5_url}/ERP5Site_invalidateIngestionObjects?reference=#{reference}") checkReferenceChars(reference)
uri = URI(URI.escape("#{@erp5_url}/ERP5Site_invalidateIngestionObjects?reference=#{reference}"))
res = handleRequest(uri) res = handleRequest(uri)
if res == FALSE if res == FALSE
@logger.abortExecution() @logger.abortExecution()
end end
@logger.info("Remote file successfully ingested.", print=TRUE)
end
def rename(reference, new_reference)
@logger.info("Rename requested for reference #{reference}, new reference #{new_reference}", print=TRUE)
checkReferenceChars(reference)
checkReferenceChars(new_reference)
uri = URI(URI.escape("#{@erp5_url}/ERP5Site_renameIngestion?reference=#{reference}&new_reference=#{new_reference}"))
res = handleRequest(uri)
if res == FALSE
@logger.abortExecution()
end
@logger.info("Remote file successfully renamed.", print=TRUE)
end end
def increaseDatasetVersion(reference) def increaseDatasetVersion(reference)
...@@ -46,12 +67,12 @@ class WendelinClient ...@@ -46,12 +67,12 @@ class WendelinClient
@logger.warn("Could not increase data set version because dataset reference is empty.") @logger.warn("Could not increase data set version because dataset reference is empty.")
else else
@logger.info("Increasing dataset version") @logger.info("Increasing dataset version")
uri = URI("#{@erp5_url}/ERP5Site_increaseDatasetVersion?reference=#{reference}") uri = URI(URI.escape("#{@erp5_url}/ERP5Site_increaseDatasetVersion?reference=#{reference}"))
begin begin
res = open(uri, http_basic_authentication: [@user, @password]).read res = open(uri, http_basic_authentication: [@user, @password]).read
rescue Exception => e rescue Exception => e
@logger.error("An error occurred while increasing dataset version: " + e.to_s) @logger.error("An error occurred while increasing dataset version: " + e.to_s)
@logger.error(e.backtrace) @logger.error(e.backtrace)
end end
end end
end end
...@@ -63,21 +84,13 @@ class WendelinClient ...@@ -63,21 +84,13 @@ class WendelinClient
sleep 3 sleep 3
end end
if exists(reference) if exists(reference)
@logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\ @logger.info("There is another ingestion already done for the pair dataset-filename. Reference "\
+ reference, print=TRUE) + reference, print=TRUE)
@logger.info("Rename your file or download the full dataset to make local changes.", print=TRUE) @logger.info("Rename your file or download the full dataset to make local changes.", print=TRUE)
return FALSE return FALSE
end
if reference.include? "#" or reference.include? "+"
raise "invalid chars in file name. Please rename it."
end
begin
uri = URI("#{@erp5_url}/ingest?reference=#{reference}")
rescue Exception => e
@logger.error("An error occurred while generating url: " + e.to_s)
@logger.error(e.backtrace)
raise "invalid chars in file name. Please rename it."
end end
checkReferenceChars(reference)
uri = URI(URI.escape("#{@erp5_url}/ingest?reference=#{reference}"))
response = handleRequest(uri, reference, data_chunk) response = handleRequest(uri, reference, data_chunk)
if response == FALSE if response == FALSE
return FALSE return FALSE
...@@ -88,28 +101,28 @@ class WendelinClient ...@@ -88,28 +101,28 @@ class WendelinClient
end end
def eachDataStreamContentChunk(id, chunk_size) def eachDataStreamContentChunk(id, chunk_size)
uri = URI("#{@erp5_url}#{id}/getData") uri = URI(URI.escape("#{@erp5_url}#{id}/getData"))
@logger.info("Downloading...", print=TRUE) @logger.info("Downloading...", print=TRUE)
first = TRUE first = TRUE
res = open(uri, http_basic_authentication: [@user, @password]) { res = open(uri, http_basic_authentication: [@user, @password]) {
|content| |content|
while true while true
chunk = content.read(chunk_size) chunk = content.read(chunk_size)
if chunk.nil? if chunk.nil?
if first if first
yield chunk yield chunk
end end
@logger.info("Done", print=TRUE) @logger.info("Done", print=TRUE)
break break
end end
first = FALSE first = FALSE
yield chunk yield chunk
end end
} }
end end
def getDataStreams(data_set_reference) def getDataStreams(data_set_reference)
uri = URI("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}") uri = URI(URI.escape("#{@erp5_url}getDataStreamList?data_set_reference=#{data_set_reference}"))
str = handleRequest(uri) str = handleRequest(uri)
if str == FALSE if str == FALSE
@logger.abortExecution() @logger.abortExecution()
...@@ -127,44 +140,44 @@ class WendelinClient ...@@ -127,44 +140,44 @@ class WendelinClient
req.basic_auth @user, @password req.basic_auth @user, @password
if data_chunk != nil if data_chunk != nil
@logger.info("Setting request form data...", print=TRUE) @logger.info("Setting request form data...", print=TRUE)
begin begin
req.set_form_data('data_chunk' => data_chunk) req.set_form_data('data_chunk' => data_chunk)
rescue java.lang.OutOfMemoryError rescue java.lang.OutOfMemoryError
@logger.logOutOfMemoryError(reference) @logger.logOutOfMemoryError(reference)
return FALSE return FALSE
end end
@logger.info("Sending record:'#{reference}'...", print=TRUE) @logger.info("Sending record:'#{reference}'...", print=TRUE)
end end
begin begin
res = Net::HTTP.start(uri.hostname, uri.port, res = Net::HTTP.start(uri.hostname, uri.port,
:use_ssl => (uri.scheme == 'https'), :use_ssl => (uri.scheme == 'https'),
:verify_mode => OpenSSL::SSL::VERIFY_NONE, :verify_mode => OpenSSL::SSL::VERIFY_NONE,
:ssl_timeout => 300, :open_timeout => 300, :read_timeout => 300, :ssl_timeout => 300, :open_timeout => 300, :read_timeout => 300,
) do |http| ) do |http|
http.request(req) http.request(req)
end end
rescue Exception => e rescue Exception => e
@logger.error("HTTP ERROR: " + e.to_s, print=TRUE) @logger.error("HTTP ERROR: " + e.to_s, print=TRUE)
@logger.error(e.backtrace) @logger.error(e.backtrace)
return FALSE return FALSE
else else
if res.kind_of?(Net::HTTPSuccess) # res.code is 2XX if res.kind_of?(Net::HTTPSuccess) # res.code is 2XX
@logger.info("Done") @logger.info("Done")
return res.body return res.body
else else
@logger.error("HTTP FAIL - code: #{res.code}", print=TRUE) @logger.error("HTTP FAIL - code: #{res.code}", print=TRUE)
if res.code == '500' or res.code == '502' or res.code == '503' if res.code == '500' or res.code == '502' or res.code == '503'
@logger.error("Internal Server Error: if the error persists, please contact the administrator.", print=TRUE) @logger.error("Internal Server Error: if the error persists, please contact the administrator.", print=TRUE)
elsif res.code == '401' elsif res.code == '401'
@logger.error("Unauthorized access. Please check your user credentials and try again.", print=TRUE) @logger.error("Unauthorized access. Please check your user credentials and try again.", print=TRUE)
@logger.abortExecution() @logger.abortExecution()
else else
@logger.error("Sorry, an error ocurred. If the error persists, please contact the administrator.", print=TRUE) @logger.error("Sorry, an error ocurred. If the error persists, please contact the administrator.", print=TRUE)
end end
return FALSE return FALSE
end end
end end
end end
end end
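The client above now rejects characters that would corrupt the request URL and escapes the rest before sending it. A small standalone sketch of that pattern follows; the forbidden set mirrors checkReferenceChars, ERB::Util.url_encode stands in for URI.escape (which is obsolete in current Ruby), and the host name is a placeholder.

  require "uri"
  require "erb"

  FORBIDDEN_CHARS = ["&", ";", "#", "%", '"', "+"].freeze

  # Raise early instead of sending a reference the server cannot parse back out of the query string.
  def check_reference_chars(reference)
    if FORBIDDEN_CHARS.any? { |char| reference.include?(char) }
      raise "Invalid char in filename. The following chars are not allowed: & ; % \" + #. Please rename the file."
    end
  end

  # Build the lookup URL with the reference passed as an escaped query parameter.
  def exists_uri(erp5_url, reference)
    check_reference_chars(reference)
    URI("#{erp5_url}/ingestionReferenceExists?reference=#{ERB::Util.url_encode(reference)}")
  end

  puts exists_uri("https://example.invalid/erp5", "my_dataset/sample file_001.csv")
  # exists_uri("https://example.invalid/erp5", "bad#name.csv")  # raises before any request is made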
ebulk ingest-download tool examples
Basic ingestion/download
ebulk pull <DATASET>
* downloads the content of target dataset
ebulk push <DATASET>
* ingests files into the target dataset
ebulk pull <DATASET> -d <PATH>
* downloads the content of target dataset in target PATH
* future operations on PATH directory will use the DATASET reference implicitly
ebulk push <DATASET> -c 20
* ingests files into the <DATASET> splitting them in chunks of 20MB
ebulk push <DATASET> -s <STORAGE>
* ingests the content of the input storage [http, ftp, s3] into the target dataset
ebulk push <DATASET> -s <STORAGE> --advanced
* allows the user to edit the configuration file of the selected storage
ebulk push <DATASET> --custom-storage
* user can install and configure a new input plugin storage
Manage local changes
ebulk status <DATASET>
* checks local changes of target dataset
ebulk add <PATH>
* marks files in path for ingestion
ebulk remove <PATH>
* marks files in path for deletion
ebulk reset <PATH>
* resets marked files in path
ebulk pull --discard-changes
* discards local changes by checking the remote dataset
ebulk ingest-download tool help ebulk ingest-download tool help
usage: ebulk <command> <dataset> [options...] ebulk [-h|--help] [-r|--readme] [-e|--examples] <command> [<args>]
[-d|--directory <path>] [-c|--chunk <size>]
[-s|--storage <storage>] [-cs|--custom-storage]
[-a|--advanced] [-dc|--discard-changes]
commands: commands:
pull <dataset> Downloads the content of the target dataset from the site into the output folder pull [<dataset>] Downloads the content of the target dataset from the site into the output location
push <dataset> Ingests the content of the input folder into a target dataset on the site push [<dataset>] Ingests the content of the input location into a target dataset on the site
-h, --help Tool help status [<dataset>] Lists the local changes of target dataset
-r, --readme Opens README file add <path> Marks new or modified files in path for ingestion
remove <path> Marks files in path for removal
reset <path> Resets marked files in path
-h, --help Tool help
-r, --readme Opens README file
-e, --examples Shows some tool usage examples
argument: argument:
dataset Mandatory. Unique reference for the target dataset dataset argument Unique reference for the target dataset
If empty, current directory will be used as dataset directory and reference
It must start with a letter, and only alphanumerics, dots ( . ), underscores ( _ ) and hyphens ( - ) are allowed It must start with a letter, and only alphanumerics, dots ( . ), underscores ( _ ) and hyphens ( - ) are allowed
* For download, the reference must be one of the available datasets on the site * For download, the reference must be one of the available datasets on the site
* For ingestion, an existing reference will append the files to the corresponding dataset * For ingestion, an existing reference will append the files to the corresponding dataset
* A new reference will create a new dataset on the site * A new reference will create a new dataset on the site
It could be a path, then the last directory will be interpreted as the reference It could be a path, then that directory will be used as dataset reference
e.g. pull my_directory/sample/ --> dataset reference will be "sample" e.g. pull my_directory/sample/ --> dataset reference will be "sample"
options: options:
-d, --directory <path> Besides the dataset reference, sets the dataset directory and it links that location to the reference -d, --directory <path> Besides the dataset reference, sets the dataset directory and it links that location to the reference
-c, --chunk <chunk> Sets the chunk size (in megabytes) to split large files -c, --chunk <size> Sets the chunk size (in megabytes) to split large files
-s, --storage <storage> Uses the selected input storage from this set: [http, ftp, s3] -s, --storage <storage> Uses the selected input storage from this set: [http, ftp, s3]
-cs, --custom-storage Allows the user to set a new input storage -cs, --custom-storage Allows the user to set a new input storage
-a, --advanced Allows the user to edit the Embulk configuration file of the input storage -a, --advanced Allows the user to edit the Embulk configuration file of the input storage
-dc, --discard-changes Discards local changes by checking the remote dataset
examples:
ebulk pull <DATASET>
* downloads the content of target dataset
ebulk push <DATASET>
* ingests files into the target dataset
ebulk pull <DATASET> -d <PATH>
* downloads the content of target dataset in target PATH
* future operations on PATH directory will use the DATASET reference implicitly
ebulk push <DATASET> -c 20
* ingests files into the <DATASET> splitting them in chunks of 20MB
ebulk push <DATASET> -s <STORAGE>
* ingests the content of the input storage [http, ftp, s3] into the target dataset
ebulk push <DATASET> -s <STORAGE> --advanced
* allows the user to edit the configuration file of the selected storage
ebulk push <DATASET> --custom-storage
* user can install and configure a new input plugin storage