Commit d1f914ac authored by Xiaowu Zhang

parameter

parent 507c1beb
......@@ -17,6 +17,10 @@ var args = require("yargs")
.nargs("link", 1)
.nargs("file", 1)
.nargs("depth", 1)
.nargs("include_html", 1)
.nargs("include_js", 1)
.nargs("include_css", 1)
.nargs("include_html", 1)
.argv;
fs.open(args.file, 'w', function(err, file){
......@@ -26,6 +30,10 @@ fs.open(args.file, 'w', function(err, file){
var depth = 3,
count = 1,
include_html = true,
include_js = false,
include_css = false,
include_header = false,
link = args.link,
builder = require("xmlbuilder"),
readline = require('readline'),
......@@ -33,8 +41,24 @@ var depth = 3,
url_list = [],
crawler = new SCrawler(link);
// Apply command-line overrides on top of the defaults declared earlier in the file.
// Only one depth override is kept; the superseded one-line form performed the
// same assignment a second time.
if (args.depth) {
  depth = args.depth;
}

// Each include_* option is switched on only by the exact value "True"; any other
// supplied (truthy) value switches it off. An option that is not supplied keeps
// its default.
if (args.include_html) {
  include_html = (args.include_html === "True");
}
if (args.include_js) {
  include_js = (args.include_js === "True");
}
if (args.include_css) {
  include_css = (args.include_css === "True");
}
if (args.include_header) {
  include_header = (args.include_header === "True");
}

// Crawler tuning.
// NOTE(review): interval is presumably in milliseconds — confirm against the crawler library docs.
crawler.interval = 250;
crawler.maxConcurrency = 5;
crawler.maxDepth = depth;
......@@ -43,13 +67,20 @@ crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
readline.cursorTo(process.stdout, 0);
process.stdout.write(count + "");
count+=1;
url_list.push({
"loc": queueItem.url,
"stateData": queueItem.stateData,
"referrer": queueItem.referrer
});
if (include_header) {
url_list.push({
"loc": queueItem.url,
"stateData": queueItem.stateData,
"referrer": queueItem.referrer
});
} else {
url_list.push({
"loc": queueItem.url
});
}
});
// Fire callback
crawler.on("complete", function() {
readline.cursorTo(process.stdout, 0);
......@@ -67,29 +98,38 @@ crawler.on("complete", function() {
// Report a failed fetch on the console. The failed URL is added to url_list
// (with its state data and referrer) only when include_header is enabled, and
// it is added exactly once.
crawler.on("fetcherror", function(queueItem, response) {
  console.log("Error " + response.statusCode + " while fetching " + queueItem.url);
  if (include_header) {
    url_list.push({
      "loc": queueItem.url,
      "stateData": queueItem.stateData,
      "referrer": queueItem.referrer
    });
  }
});
// Resource discovery hook assigned to the crawler: parses the fetched buffer as
// UTF-8 HTML and returns the list of URLs found in it. Which kinds of references
// are returned is controlled by the include_html, include_css and include_js flags.
//
// buffer    - response body; converted with toString("utf8") before parsing.
// queueItem - the queue entry being processed (not used here).
// Returns an array of attribute values: <a href>, then <link href>, then
// <script src>, each group present only when its flag is enabled.
crawler.discoverResources = function(buffer, queueItem) {
  var $ = cheerio.load(buffer.toString("utf8"));

  // Collects the value of `attribute` from every element matching `selector`.
  function collect(selector, attribute) {
    return $(selector).map(function () {
      return $(this).attr(attribute);
    }).get();
  }

  var link_list = [];
  if (include_html) {
    link_list = link_list.concat(collect("a[href]", "href"));
  }
  if (include_css) {
    link_list = link_list.concat(collect("link[href]", "href"));
  }
  if (include_js) {
    link_list = link_list.concat(collect("script[src]", "src"));
  }
  return link_list;
};
// Start Crawl
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment