Commit 3ff3b171 authored by Alexandra Rogova

checks if file is loaded before downloading -> incremental index

parent 6986911d
jszip @ 8742db3a
Subproject commit 8742db3a5f725f6651948018d77be3499059814d
......@@ -5,6 +5,7 @@
<script src="../../external/jio/external/rsvp-2.0.4.js"></script>
<script src="../../external/jio/dist/jio-latest.js"></script>
<script src="../../external/renderjs/dist/renderjs-latest.js"></script>
<script src="../../external/jszip/dist/jszip.js"></script>
<script src="../js/search.js"></script>
<link rel="stylesheet" type="text/css" href="../../css/mynij.css">
</head>
......
......@@ -6,40 +6,78 @@
.declareAcquiredMethod("add_to_index", "add_to_index")
.declareAcquiredMethod("is_db_empty", "is_db_empty")
.declareAcquiredMethod("loaded", "loaded")
.declareAcquiredMethod("all_loaded", "all_loaded")
.allowPublicAcquisition("add", function (page) {
return this.add_to_index(page[0]);
})
.allowPublicAcquisition("add_file", function(file_name){
return this.loaded(file_name);
})
.setState({
to_load: [
// "44_svt.xml", //136 urls
"44_svt.xml", //135 urls
// "allemandfacile.xml", //650 urls
// "anglaisfacile.xml", //567 urls
// "bescherelle.xml", //65 urls
// "codeacademy.xml", //27 urls
// "bescherelle.xml", //60 urls
// "codeacademy.xml", //28 urls
// "francaisfacile.xml", //1119 urls
// "hgeo_college.xml", //226 urls
// "histoirencours.xml", //1415 urls
// "italienfacile.xml", //1477 urls
// "jerevise.xml", //918 urls
//"hgeo_college.xml", //227 urls
//"histoirencours.xml", //1415 urls
//"italienfacile.xml", //1477 urls
//"jerevise.xml", //918 urls
// "junior_science_et_vie.xml", //532 urls
// "kmusic.xml", //106 urls ---
"larousse.xml", //4563 urls
// "kmusic.xml", //107 urls
// "larousse.xml", //4563 urls
// //"letudiant.xml", //41649 urls
// "lewebpedagogique.xml", //298 urls
// //"livrespourtous.xml", //12061 urls
// "mathovore.xml", //2221 urls
// "monanneeaucollege.xml", //120 urls
// "mathovore.xml", //2221 urls
// "monanneeaucollege.xml", //121 urls
// "nosdevoirs.xml", //462 urls
// "physagreg.xml", //150 urls
// "physique_chimie_college.xml", //282 urls
// "reviser_brevet.xml", //229 urls
// "soutien67.xml", //1604 urls
// //"superprof.xml", //12296 urls
// "technologieaucollege27.xml", //128 urls
// "espagnolfacile.xml", //3352 urls
// "vivelessvt.xml" //1257 urls
// "technologieaucollege27.xml", //129 urls
// "espagnolfacile.xml", //3352 urls
// "vivelessvt.xml", //1257 urls
// // TEST SITEMAPS TO FILL INDEX
// "20minutes.xml", //2741 urls
// "independent.xml", //3094 urls
// "lille.xml", //1742 urls
// "metronews.xml", //3585 urls
// "paris.xml", //338 urls
// "pypi.xml", //6685 urls
// "theguardian.xml", //5104 urls
// "linuxfr.xml", //3515 urls
// "lyon.xml", //3560 urls
// "lavoixdunord.xml", //3407 urls
// "python.xml", //2503 urls
// "reddit.xml", //1563 urls
// "thesun.xml", //2668 urls
// "sputniknews.xml", //1581 urls
// "nytimes.xml", //1604 urls
// "cnn.xml", //3047 urls
// "cnbc.xml", //3346 urls
// "bbc.xml", //2594 urls
// "vox.xml", //1194 urls
// "cbsnews.xml", //1260 urls
// "mirror.xml", //3528 urls
// "abcnews.xml", //1077 urls
// "lequipe.xml", //3455 urls
// "rugbyrama.xml", //1817 urls
// "elle.xml", //3532 urls
// "figaro.xml", //2965 urls
// "lepoint.xml", //3747 urls
// "telerama.xml", //2593 urls
// "liberation.xml", //819 urls
// "lemonde.xml", //3517 urls
// "leparisien.xml", //2189 urls
// "latribune.xml" //3190 urls
]
})
......@@ -57,7 +95,31 @@
var gadget = this,
promise_list = [];
return gadget.is_db_empty()
return gadget.all_loaded()
.push(function(result){
var i,
file_path,
file_name;
if (result === null){
for (i = 0; i < gadget.state.to_load.length; i += 1){
file_path = "../../../crawler_test/" + gadget.state.to_load[i];
file_name = gadget.state.to_load[i];
promise_list.push(gadget.load_file(file_path, file_name));
}
return RSVP.all(promise_list);
} else {
for (i = 0; i < gadget.state.to_load.length; i += 1){
file_name = gadget.state.to_load[i];
if (Object.keys(result).indexOf(file_name) < 0){
file_path = "../../../crawler_test/" + gadget.state.to_load[i];
promise_list.push(gadget.load_file(file_path, file_name));
}
}
return RSVP.all(promise_list);
}
});
/* return gadget.is_db_empty()
.push(function(empty){
if (empty) {
for (var i=0; i<gadget.state.to_load.length; i+=1){
......@@ -65,18 +127,18 @@
}
return RSVP.all(promise_list);
}
});
});*/
})
.declareMethod("load_file", function(file_link){ //OK
.declareMethod("load_file", function(file_path, file_name){ //OK
var gadget = this;
console.log("loading " + file_link);
console.log("loading " + file_path);
return new RSVP.Queue()
.push(function(){
return jIO.util.ajax({url : file_link});
return jIO.util.ajax({url : file_path});
})
.push(function(file){
return gadget.state.parser_gadget.concurrent_parse(file.currentTarget.responseText);
return gadget.state.parser_gadget.concurrent_parse(file.currentTarget.responseText, file_name);
})
.push(undefined, function (my_error) {console.log(my_error)});
});
......
......@@ -22,22 +22,6 @@
}
);
/*db = jIO.createJIO(
{
type : "zip",
sub_storage : {
type : "query",
sub_storage : {
type : "uuid",
sub_storage : {
type : "indexeddb",
database : "mynij"
}
}
}
}
);*/
this.changeState({
index : index,
db : db,
......@@ -47,40 +31,55 @@
return this._load_index();
})
.declareMethod("get", function(id){
return this.state.db.get(id);
})
.declareMethod("get_loaded", function(){
var gadget = this;
return this.state.db.get("loaded")
.push(function(result){
return result;
})
.push(undefined, function (my_error) {return null});
})
.declareMethod("add_page", function(page_info){ //page_info = {link, title, description, item}
var gadget = this,
tmp;
tmp = page_info;
tmp.portal_type = "page";
return gadget.state.db.post(tmp)
return gadget.state.db.put(page_info.link, tmp)
.push(function(){
return gadget.state.db.post(tmp);
var defer = RSVP.defer();
gadget.state.index.add(page_info.link, page_info.title + " " + page_info.item, defer.resolve.bind(defer));
return defer.promise;
})
.push(function(){
for (var i = 0; i < 5; i += 1){
gadget.state.index.add("title_"+page_info.link+"_"+i, page_info.title);
gadget.state.index.add("body_"+page_info.link+"_"+i, page_info.item);
}
})
/* .push(function(){
return gadget._save_index();
})*/;
});
})
.declareMethod("loaded", function(file_name){
var gadget = this,
id;
return gadget.state.db.get("loaded")
.push(function(result){
var tmp = result;
tmp[file_name] = true;
return gadget.state.db.put("loaded", tmp);
})
.push(undefined, function (my_error) {
var tmp = {};
tmp[file_name] = true;
return gadget.state.db.put("loaded", tmp);
});
})
.declareMethod("_save_index", function(){ //OK
//console.log("saving index");
var gadget = this;
var stringified = this.state.index.export(this.state.msgpack);
if (this.state.index_id){
return gadget.state.db.put(gadget.state.db.index_id, {"portal_type" : "index", "index" : stringified});
} else {
return gadget.state.db.post({"portal_type" : "index", "index" : stringified})
.push(function(created_id){
return gadget.changeState({
index_id : created_id
});
});
}
var serialized = this.state.index.export_test();
return this.state.db.put("index", serialized);
})
.declareMethod("search", function(query){
......@@ -88,31 +87,13 @@
})
.declareMethod("_load_index", function(msgpack){ //OK
var gadget = this,
query = 'portal_type:"index"',
id;
return this.state.db.allDocs({"query" : query})
.push(function(result){
if (result.data.total_rows !== 0){
console.log("index found");
id = result.data.rows[0].id;
return gadget.state.db.get(result.data.rows[0].id)
.push(function(result){
console.log("started index import");
var tmp_index = FlexSearch.create("memory");
tmp_index.import(result.index, gadget.state.msgpack);
console.log("index import done");
gadget.changeState({
index : tmp_index
});
})
.push(function(){
return gadget.changeState({
index_id : id
});
});
}
});
var gadget = this;
return gadget.state.db.get("index")
.push(function(index){
gadget.state.index.import_test(index.ids, index.map, index.ctx);
console.log("index imported from memory");
})
.push(undefined, function (my_error) {});
})
.declareMethod("is_empty", function(){
......@@ -124,15 +105,14 @@
.declareMethod("get_index", function(){
console.log(this.state.index.info());
console.log(this.state.msgpack);
return this.state.index.export(this.state.msgpack);
//return this.state.index.export({serialize: false});
//return this.state.index.export(this.state.msgpack);
return this.state.index.export_test();
})
.declareMethod("add_index", function(serialized_index){
.declareMethod("add_index", function(ids, map, ctx){
console.log("adding index");
this.state.index.import(serialized_index, this.state.msgpack);
//this.state.index.import(serialized_index, {serialize: false});
//this.state.index.import(serialized_index, this.state.msgpack);
this.state.index.import_test(ids, map, ctx);
console.log(this.state.index.info());
});
......
......@@ -3,7 +3,7 @@
(function(window, RSVP, rJS, jIO) {
"use strict";
function dispatchQueue(context, function_used, argument_list, number_queue, callback) {
function dispatchQueue(context, function_used, argument_list, number_queue) {
var result_promise_list = [],
i,
defer;
......@@ -16,7 +16,6 @@
if (argument_list.length > 0) {
function_used.apply(context, argument_list.shift())
.then(function(result) {
callback(result);
pushAndExecute(global_defer);
})
.fail(function(error) {
......@@ -41,44 +40,47 @@
rJS(window)
.declareAcquiredMethod("add", "add")
.declareAcquiredMethod("add_file", "add_file")
.declareMethod("concurrent_parse", function(links_file){
var gadget = this,
links = new DOMParser().parseFromString(links_file, "text/xml").getElementsByTagName("url"),
links_modified = [],
i;
var callback_method = function(page){
var item,
result;
if (page !== undefined){
item = new DOMParser().parseFromString(page.currentTarget.response, "text/html");
result = {
link : page.currentTarget.responseURL.slice("https://softinst116265.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=".length),
title : item.title,
//description : item.querySelector('meta[name="description"]').getAttribute('content'),
description : "",
item : item.getElementsByTagName("body")[0].innerText
};
return gadget.add(result);
}
};
.declareMethod("concurrent_parse", function(links_file, file_name){
var gadget = this,
links = new DOMParser().parseFromString(links_file, "text/xml").getElementsByTagName("url"),
links_modified = [],
i;
for (i=0; i<links.length; i+=1){
links_modified[i] = [links[i].getElementsByTagName('loc')[0].textContent];
}
return new RSVP.Queue().push(function() {
return dispatchQueue(this, gadget._get, links_modified, 2, callback_method);
return dispatchQueue(gadget, gadget._get, links_modified, 1);
})
.push(function(){
return gadget.add_file(file_name);
});
})
.declareMethod("_get", function(link){
var gadget = this;
return new RSVP.Queue()
.push(function(){
var rng = Math.floor(Math.random() * Math.floor(10));
if (rng % 2 === 0 ) return jIO.util.ajax({url : "https://softinst116265.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=" + link});
else return jIO.util.ajax({url : "https://softinst116446.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=" + link});
})
.push(function(page){
var item,
result;
if (page !== undefined){
item = new DOMParser().parseFromString(page.currentTarget.response, "text/html");
result = {
link : page.currentTarget.responseURL.slice("https://softinst116265.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=".length),
title : item.title,
//description : item.querySelector('meta[name="description"]').getAttribute('content'),
description : "",
item : item.getElementsByTagName("body")[0].innerText
};
return gadget.add(result);
}
})
.push(undefined, function (my_error) {console.log(my_error)});
});
......
......@@ -16,6 +16,14 @@
.allowPublicAcquisition("is_db_empty", function(){
return this.state.model_gadget.is_empty();
})
.allowPublicAcquisition("loaded", function(file_name){
return this.state.model_gadget.loaded(file_name);
})
.allowPublicAcquisition("all_loaded", function(){
return this.state.model_gadget.get_loaded();
})
.ready(function(){
var model_gadget,
......@@ -54,33 +62,56 @@
.declareMethod("init_buttons", function(){
var event_handler,
upload_handler,
i,
gadget = this;
upload_handler,
i,
gadget = this;
event_handler = function(event){
if (event.target.value === "export"){
return gadget.state.model_gadget.get_index()
.push(function(serialized_index){
console.log("stringified");
var a = document.createElement('a');
a.href = URL.createObjectURL(new Blob([serialized_index]));
a.download = 'index.bin';
a.click();
.push(function(index){
var zip = new JSZip();
zip.file("index_ids.mynij", index.ids);
zip.file("index_map.mynij", index.map);
zip.file("index_ctx.mynij", index.ctx);
zip.generateAsync({type : "blob"})
.then(function(ziped){
var a = document.createElement('a');
a.href = URL.createObjectURL(ziped);
a.download = 'index.zip';
a.click();
});
});
}
};
upload_handler = function(){
var file_list = this.files,
reader = new FileReader();
reader.onload = function (evt) {
var view = new Uint8Array(evt.target.result);
gadget.state.model_gadget.add_index(view);
var file = this.files[0],
reader = new FileReader(),
zip = new JSZip();
reader.onload = function(event){
//return gadget.state.model_gadget.add_index(event.target.result);
var view = new Uint8Array(event.target.result),
load = function(file_name){
return zip.loadAsync(view)
.then(function(){
return zip.file(file_name).async("string");
})
.then(function(data){
return data;
});
},
promises = [];
promises.push(load("index_ids.mynij"));
promises.push(load("index_map.mynij"));
promises.push(load("index_ctx.mynij"));
Promise.all(promises)
.then(function(result){
return gadget.state.model_gadget.add_index(result[0], result[1], result[2]);
});
};
for (i = 0; i < file_list.length; i += 1){
reader.readAsArrayBuffer(file_list[i], "Uint8Array");
}
//reader.readAsText(file);
reader.readAsArrayBuffer(file);
};
document.getElementById("export").addEventListener("click", event_handler);
......@@ -96,7 +127,6 @@
})
.push(function(result){
console.log("search done");
console.log(result);
if (result.length === 0) {
return gadget.state.result_gadget.addItem({
title : "No results found",
......@@ -105,9 +135,18 @@
} else {
var i, promise_list = [];
for (i=0; i<result.length; i+=1){
promise_list.push(gadget.state.result_gadget.addItem(result[i].doc, key));
promise_list.push(gadget.state.model_gadget.get(result[i]));
}
return RSVP.all(promise_list);
return new RSVP.Queue()
.push(function(){
return RSVP.all(promise_list);
})
.push(function(result){
var i, promise_list = [];
for (i=0; i<result.length; i+=1){
promise_list.push(gadget.state.result_gadget.addItem(result[i], key));
}
});
}
});
})
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment