Commit 3ff3b171 authored by Alexandra Rogova

checks if file is loaded before downloading -> incremental index

parent 6986911d
jszip @ 8742db3a
Subproject commit 8742db3a5f725f6651948018d77be3499059814d
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
<script src="../../external/jio/external/rsvp-2.0.4.js"></script> <script src="../../external/jio/external/rsvp-2.0.4.js"></script>
<script src="../../external/jio/dist/jio-latest.js"></script> <script src="../../external/jio/dist/jio-latest.js"></script>
<script src="../../external/renderjs/dist/renderjs-latest.js"></script> <script src="../../external/renderjs/dist/renderjs-latest.js"></script>
<script src="../../external/jszip/dist/jszip.js"></script>
<script src="../js/search.js"></script> <script src="../js/search.js"></script>
<link rel="stylesheet" type="text/css" href="../../css/mynij.css"> <link rel="stylesheet" type="text/css" href="../../css/mynij.css">
</head> </head>
......
...@@ -6,40 +6,78 @@ ...@@ -6,40 +6,78 @@
.declareAcquiredMethod("add_to_index", "add_to_index") .declareAcquiredMethod("add_to_index", "add_to_index")
.declareAcquiredMethod("is_db_empty", "is_db_empty") .declareAcquiredMethod("is_db_empty", "is_db_empty")
.declareAcquiredMethod("loaded", "loaded")
.declareAcquiredMethod("all_loaded", "all_loaded")
.allowPublicAcquisition("add", function (page) { .allowPublicAcquisition("add", function (page) {
return this.add_to_index(page[0]); return this.add_to_index(page[0]);
}) })
.allowPublicAcquisition("add_file", function(file_name){
return this.loaded(file_name);
})
.setState({ .setState({
to_load: [ to_load: [
// "44_svt.xml", //136 urls "44_svt.xml", //135 urls
// "allemandfacile.xml", //650 urls // "allemandfacile.xml", //650 urls
// "anglaisfacile.xml", //567 urls // "anglaisfacile.xml", //567 urls
// "bescherelle.xml", //65 urls // "bescherelle.xml", //60 urls
// "codeacademy.xml", //27 urls // "codeacademy.xml", //28 urls
// "francaisfacile.xml", //1119 urls // "francaisfacile.xml", //1119 urls
// "hgeo_college.xml", //226 urls //"hgeo_college.xml", //227 urls
// "histoirencours.xml", //1415 urls //"histoirencours.xml", //1415 urls
// "italienfacile.xml", //1477 urls //"italienfacile.xml", //1477 urls
// "jerevise.xml", //918 urls //"jerevise.xml", //918 urls
// "junior_science_et_vie.xml", //532 urls // "junior_science_et_vie.xml", //532 urls
// "kmusic.xml", //106 urls --- // "kmusic.xml", //107 urls
"larousse.xml", //4563 urls // "larousse.xml", //4563 urls
// //"letudiant.xml", //41649 urls // //"letudiant.xml", //41649 urls
// "lewebpedagogique.xml", //298 urls // "lewebpedagogique.xml", //298 urls
// //"livrespourtous.xml", //12061 urls // //"livrespourtous.xml", //12061 urls
// "mathovore.xml", //2221 urls // "mathovore.xml", //2221 urls
// "monanneeaucollege.xml", //120 urls // "monanneeaucollege.xml", //121 urls
// "nosdevoirs.xml", //462 urls // "nosdevoirs.xml", //462 urls
// "physagreg.xml", //150 urls // "physagreg.xml", //150 urls
// "physique_chimie_college.xml", //282 urls // "physique_chimie_college.xml", //282 urls
// "reviser_brevet.xml", //229 urls // "reviser_brevet.xml", //229 urls
// "soutien67.xml", //1604 urls // "soutien67.xml", //1604 urls
// //"superprof.xml", //12296 urls // //"superprof.xml", //12296 urls
// "technologieaucollege27.xml", //128 urls // "technologieaucollege27.xml", //129 urls
// "espagnolfacile.xml", //3352 urls // "espagnolfacile.xml", //3352 urls
// "vivelessvt.xml" //1257 urls // "vivelessvt.xml", //1257 urls
// // TEST SITEMAPS TO FILL INDEX
// "20minutes.xml", //2741 urls
// "independent.xml", //3094 urls
// "lille.xml", //1742 urls
// "metronews.xml", //3585 urls
// "paris.xml", //338 urls
// "pypi.xml", //6685 urls
// "theguardian.xml", //5104 urls
// "linuxfr.xml", //3515 urls
// "lyon.xml", //3560 urls
// "lavoixdunord.xml", //3407 urls
// "python.xml", //2503 urls
// "reddit.xml", //1563 urls
// "thesun.xml", //2668 urls
// "sputniknews.xml", //1581 urls
// "nytimes.xml", //1604 urls
// "cnn.xml", //3047 urls
// "cnbc.xml", //3346 urls
// "bbc.xml", //2594 urls
// "vox.xml", //1194 urls
// "cbsnews.xml", //1260 urls
// "mirror.xml", //3528 urls
// "abcnews.xml", //1077 urls
// "lequipe.xml", //3455 urls
// "rugbyrama.xml", //1817 urls
// "elle.xml", //3532 urls
// "figaro.xml", //2965 urls
// "lepoint.xml", //3747 urls
// "telerama.xml", //2593 urls
// "liberation.xml", //819 urls
// "lemonde.xml", //3517 urls
// "leparisien.xml", //2189 urls
// "latribune.xml" //3190 urls
] ]
}) })
...@@ -57,7 +95,31 @@ ...@@ -57,7 +95,31 @@
var gadget = this, var gadget = this,
promise_list = []; promise_list = [];
return gadget.is_db_empty() return gadget.all_loaded()
.push(function(result){
var i,
file_path,
file_name;
if (result === null){
for (i = 0; i < gadget.state.to_load.length; i += 1){
file_path = "../../../crawler_test/" + gadget.state.to_load[i];
file_name = gadget.state.to_load[i];
promise_list.push(gadget.load_file(file_path, file_name));
}
return RSVP.all(promise_list);
} else {
for (i = 0; i < gadget.state.to_load.length; i += 1){
file_name = gadget.state.to_load[i];
if (Object.keys(result).indexOf(file_name) < 0){
file_path = "../../../crawler_test/" + gadget.state.to_load[i];
promise_list.push(gadget.load_file(file_path, file_name));
}
}
return RSVP.all(promise_list);
}
});
/* return gadget.is_db_empty()
.push(function(empty){ .push(function(empty){
if (empty) { if (empty) {
for (var i=0; i<gadget.state.to_load.length; i+=1){ for (var i=0; i<gadget.state.to_load.length; i+=1){
...@@ -65,18 +127,18 @@ ...@@ -65,18 +127,18 @@
} }
return RSVP.all(promise_list); return RSVP.all(promise_list);
} }
}); });*/
}) })
.declareMethod("load_file", function(file_link){ //OK .declareMethod("load_file", function(file_path, file_name){ //OK
var gadget = this; var gadget = this;
console.log("loading " + file_link); console.log("loading " + file_path);
return new RSVP.Queue() return new RSVP.Queue()
.push(function(){ .push(function(){
return jIO.util.ajax({url : file_link}); return jIO.util.ajax({url : file_path});
}) })
.push(function(file){ .push(function(file){
return gadget.state.parser_gadget.concurrent_parse(file.currentTarget.responseText); return gadget.state.parser_gadget.concurrent_parse(file.currentTarget.responseText, file_name);
}) })
.push(undefined, function (my_error) {console.log(my_error)}); .push(undefined, function (my_error) {console.log(my_error)});
}); });
......
...@@ -22,22 +22,6 @@ ...@@ -22,22 +22,6 @@
} }
); );
/*db = jIO.createJIO(
{
type : "zip",
sub_storage : {
type : "query",
sub_storage : {
type : "uuid",
sub_storage : {
type : "indexeddb",
database : "mynij"
}
}
}
}
);*/
this.changeState({ this.changeState({
index : index, index : index,
db : db, db : db,
...@@ -47,40 +31,55 @@ ...@@ -47,40 +31,55 @@
return this._load_index(); return this._load_index();
}) })
.declareMethod("get", function(id){
return this.state.db.get(id);
})
.declareMethod("get_loaded", function(){
var gadget = this;
return this.state.db.get("loaded")
.push(function(result){
return result;
})
.push(undefined, function (my_error) {return null});
})
.declareMethod("add_page", function(page_info){ //page_info = {link, title, description, item} .declareMethod("add_page", function(page_info){ //page_info = {link, title, description, item}
var gadget = this, var gadget = this,
tmp; tmp;
tmp = page_info; tmp = page_info;
tmp.portal_type = "page"; tmp.portal_type = "page";
return gadget.state.db.post(tmp) return gadget.state.db.put(page_info.link, tmp)
.push(function(){ .push(function(){
return gadget.state.db.post(tmp); var defer = RSVP.defer();
gadget.state.index.add(page_info.link, page_info.title + " " + page_info.item, defer.resolve.bind(defer));
return defer.promise;
}) })
.push(function(){ .push(function(){
for (var i = 0; i < 5; i += 1){
gadget.state.index.add("title_"+page_info.link+"_"+i, page_info.title);
gadget.state.index.add("body_"+page_info.link+"_"+i, page_info.item);
}
})
/* .push(function(){
return gadget._save_index(); return gadget._save_index();
})*/; });
}) })
.declareMethod("_save_index", function(){ //OK .declareMethod("loaded", function(file_name){
//console.log("saving index"); var gadget = this,
var gadget = this; id;
var stringified = this.state.index.export(this.state.msgpack);
if (this.state.index_id){ return gadget.state.db.get("loaded")
return gadget.state.db.put(gadget.state.db.index_id, {"portal_type" : "index", "index" : stringified}); .push(function(result){
} else { var tmp = result;
return gadget.state.db.post({"portal_type" : "index", "index" : stringified}) tmp[file_name] = true;
.push(function(created_id){ return gadget.state.db.put("loaded", tmp);
return gadget.changeState({ })
index_id : created_id .push(undefined, function (my_error) {
}); var tmp = {};
tmp[file_name] = true;
return gadget.state.db.put("loaded", tmp);
}); });
} })
.declareMethod("_save_index", function(){ //OK
var serialized = this.state.index.export_test();
return this.state.db.put("index", serialized);
}) })
.declareMethod("search", function(query){ .declareMethod("search", function(query){
...@@ -88,31 +87,13 @@ ...@@ -88,31 +87,13 @@
}) })
.declareMethod("_load_index", function(msgpack){ //OK .declareMethod("_load_index", function(msgpack){ //OK
var gadget = this, var gadget = this;
query = 'portal_type:"index"', return gadget.state.db.get("index")
id; .push(function(index){
return this.state.db.allDocs({"query" : query}) gadget.state.index.import_test(index.ids, index.map, index.ctx);
.push(function(result){ console.log("index imported from memory");
if (result.data.total_rows !== 0){
console.log("index found");
id = result.data.rows[0].id;
return gadget.state.db.get(result.data.rows[0].id)
.push(function(result){
console.log("started index import");
var tmp_index = FlexSearch.create("memory");
tmp_index.import(result.index, gadget.state.msgpack);
console.log("index import done");
gadget.changeState({
index : tmp_index
});
}) })
.push(function(){ .push(undefined, function (my_error) {});
return gadget.changeState({
index_id : id
});
});
}
});
}) })
.declareMethod("is_empty", function(){ .declareMethod("is_empty", function(){
...@@ -124,15 +105,14 @@ ...@@ -124,15 +105,14 @@
.declareMethod("get_index", function(){ .declareMethod("get_index", function(){
console.log(this.state.index.info()); console.log(this.state.index.info());
console.log(this.state.msgpack); //return this.state.index.export(this.state.msgpack);
return this.state.index.export(this.state.msgpack); return this.state.index.export_test();
//return this.state.index.export({serialize: false});
}) })
.declareMethod("add_index", function(serialized_index){ .declareMethod("add_index", function(ids, map, ctx){
console.log("adding index"); console.log("adding index");
this.state.index.import(serialized_index, this.state.msgpack); //this.state.index.import(serialized_index, this.state.msgpack);
//this.state.index.import(serialized_index, {serialize: false}); this.state.index.import_test(ids, map, ctx);
console.log(this.state.index.info()); console.log(this.state.index.info());
}); });
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
(function(window, RSVP, rJS, jIO) { (function(window, RSVP, rJS, jIO) {
"use strict"; "use strict";
function dispatchQueue(context, function_used, argument_list, number_queue, callback) { function dispatchQueue(context, function_used, argument_list, number_queue) {
var result_promise_list = [], var result_promise_list = [],
i, i,
defer; defer;
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
if (argument_list.length > 0) { if (argument_list.length > 0) {
function_used.apply(context, argument_list.shift()) function_used.apply(context, argument_list.shift())
.then(function(result) { .then(function(result) {
callback(result);
pushAndExecute(global_defer); pushAndExecute(global_defer);
}) })
.fail(function(error) { .fail(function(error) {
...@@ -41,14 +40,33 @@ ...@@ -41,14 +40,33 @@
rJS(window) rJS(window)
.declareAcquiredMethod("add", "add") .declareAcquiredMethod("add", "add")
.declareAcquiredMethod("add_file", "add_file")
.declareMethod("concurrent_parse", function(links_file){ .declareMethod("concurrent_parse", function(links_file, file_name){
var gadget = this, var gadget = this,
links = new DOMParser().parseFromString(links_file, "text/xml").getElementsByTagName("url"), links = new DOMParser().parseFromString(links_file, "text/xml").getElementsByTagName("url"),
links_modified = [], links_modified = [],
i; i;
for (i=0; i<links.length; i+=1){
links_modified[i] = [links[i].getElementsByTagName('loc')[0].textContent];
}
return new RSVP.Queue().push(function() {
return dispatchQueue(gadget, gadget._get, links_modified, 1);
})
.push(function(){
return gadget.add_file(file_name);
});
})
var callback_method = function(page){ .declareMethod("_get", function(link){
var gadget = this;
return new RSVP.Queue()
.push(function(){
var rng = Math.floor(Math.random() * Math.floor(10));
if (rng % 2 === 0 ) return jIO.util.ajax({url : "https://softinst116265.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=" + link});
else return jIO.util.ajax({url : "https://softinst116446.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=" + link});
})
.push(function(page){
var item, var item,
result; result;
if (page !== undefined){ if (page !== undefined){
...@@ -62,22 +80,6 @@ ...@@ -62,22 +80,6 @@
}; };
return gadget.add(result); return gadget.add(result);
} }
};
for (i=0; i<links.length; i+=1){
links_modified[i] = [links[i].getElementsByTagName('loc')[0].textContent];
}
return new RSVP.Queue().push(function() {
return dispatchQueue(this, gadget._get, links_modified, 2, callback_method);
});
})
.declareMethod("_get", function(link){
return new RSVP.Queue()
.push(function(){
var rng = Math.floor(Math.random() * Math.floor(10));
if (rng % 2 === 0 ) return jIO.util.ajax({url : "https://softinst116265.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=" + link});
else return jIO.util.ajax({url : "https://softinst116446.host.vifib.net/erp5/ERP5Site_getHTTPResource?url=" + link});
}) })
.push(undefined, function (my_error) {console.log(my_error)}); .push(undefined, function (my_error) {console.log(my_error)});
}); });
......
...@@ -17,6 +17,14 @@ ...@@ -17,6 +17,14 @@
return this.state.model_gadget.is_empty(); return this.state.model_gadget.is_empty();
}) })
.allowPublicAcquisition("loaded", function(file_name){
return this.state.model_gadget.loaded(file_name);
})
.allowPublicAcquisition("all_loaded", function(){
return this.state.model_gadget.get_loaded();
})
.ready(function(){ .ready(function(){
var model_gadget, var model_gadget,
result_gadget, result_gadget,
...@@ -61,26 +69,49 @@ ...@@ -61,26 +69,49 @@
event_handler = function(event){ event_handler = function(event){
if (event.target.value === "export"){ if (event.target.value === "export"){
return gadget.state.model_gadget.get_index() return gadget.state.model_gadget.get_index()
.push(function(serialized_index){ .push(function(index){
console.log("stringified"); var zip = new JSZip();
zip.file("index_ids.mynij", index.ids);
zip.file("index_map.mynij", index.map);
zip.file("index_ctx.mynij", index.ctx);
zip.generateAsync({type : "blob"})
.then(function(ziped){
var a = document.createElement('a'); var a = document.createElement('a');
a.href = URL.createObjectURL(new Blob([serialized_index])); a.href = URL.createObjectURL(ziped);
a.download = 'index.bin'; a.download = 'index.zip';
a.click(); a.click();
}); });
});
} }
}; };
upload_handler = function(){ upload_handler = function(){
var file_list = this.files, var file = this.files[0],
reader = new FileReader(); reader = new FileReader(),
reader.onload = function (evt) { zip = new JSZip();
var view = new Uint8Array(evt.target.result); reader.onload = function(event){
gadget.state.model_gadget.add_index(view); //return gadget.state.model_gadget.add_index(event.target.result);
// Unpack the uploaded archive and hand its three parts to the model gadget.
// The archive is parsed once; the three entries are then read from the
// loaded zip instead of re-running loadAsync for every entry.
var view = new Uint8Array(event.target.result),
  entry_names = ["index_ids.mynij", "index_map.mynij", "index_ctx.mynij"];
zip.loadAsync(view)
  .then(function(loaded_zip){
    return Promise.all(entry_names.map(function(entry_name){
      var entry = loaded_zip.file(entry_name);
      // file() yields null for an unknown name; fail with a clear message
      // rather than a TypeError on null.
      if (entry === null){
        throw new Error("missing " + entry_name + " in uploaded index archive");
      }
      return entry.async("string");
    }));
  })
  .then(function(result){
    // Same order as entry_names: ids, map, ctx.
    return gadget.state.model_gadget.add_index(result[0], result[1], result[2]);
  })
  .then(undefined, function(my_error){
    // Nothing awaits this chain, so report failures here instead of
    // leaving an unhandled rejection.
    console.log(my_error);
  });
}; };
for (i = 0; i < file_list.length; i += 1){ //reader.readAsText(file);
reader.readAsArrayBuffer(file_list[i], "Uint8Array"); reader.readAsArrayBuffer(file);
}
}; };
document.getElementById("export").addEventListener("click", event_handler); document.getElementById("export").addEventListener("click", event_handler);
...@@ -96,7 +127,6 @@ ...@@ -96,7 +127,6 @@
}) })
.push(function(result){ .push(function(result){
console.log("search done"); console.log("search done");
console.log(result);
if (result.length === 0) { if (result.length === 0) {
return gadget.state.result_gadget.addItem({ return gadget.state.result_gadget.addItem({
title : "No results found", title : "No results found",
...@@ -105,9 +135,18 @@ ...@@ -105,9 +135,18 @@
} else { } else {
var i, promise_list = []; var i, promise_list = [];
for (i=0; i<result.length; i+=1){ for (i=0; i<result.length; i+=1){
promise_list.push(gadget.state.result_gadget.addItem(result[i].doc, key)); promise_list.push(gadget.state.model_gadget.get(result[i]));
} }
return new RSVP.Queue()
.push(function(){
return RSVP.all(promise_list); return RSVP.all(promise_list);
})
.push(function(result){
var i, promise_list = [];
for (i=0; i<result.length; i+=1){
promise_list.push(gadget.state.result_gadget.addItem(result[i], key));
}
});
} }
}); });
}) })
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment