Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
mynij-crawler
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Xiaowu Zhang
mynij-crawler
Commits
d1f914ac
Commit
d1f914ac
authored
Jun 14, 2022
by
Xiaowu Zhang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
parameter
parent
507c1beb
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
62 additions
and
22 deletions
+62
-22
crawler.js
crawler.js
+62
-22
No files found.
crawler.js
View file @
d1f914ac
...
...
@@ -17,6 +17,10 @@ var args = require("yargs")
.
nargs
(
"
link
"
,
1
)
.
nargs
(
"
file
"
,
1
)
.
nargs
(
"
depth
"
,
1
)
.
nargs
(
"
include_html
"
,
1
)
.
nargs
(
"
include_js
"
,
1
)
.
nargs
(
"
include_css
"
,
1
)
.
nargs
(
"
include_html
"
,
1
)
.
argv
;
fs
.
open
(
args
.
file
,
'
w
'
,
function
(
err
,
file
){
...
...
@@ -26,6 +30,10 @@ fs.open(args.file, 'w', function(err, file){
var
depth
=
3
,
count
=
1
,
include_html
=
true
,
include_js
=
false
,
include_css
=
false
,
include_header
=
false
,
link
=
args
.
link
,
builder
=
require
(
"
xmlbuilder
"
),
readline
=
require
(
'
readline
'
),
...
...
@@ -33,8 +41,24 @@ var depth = 3,
url_list
=
[],
crawler
=
new
SCrawler
(
link
);
if
(
args
.
depth
)
depth
=
args
.
depth
;
if
(
args
.
depth
)
{
depth
=
args
.
depth
;
}
if
(
args
.
include_html
)
{
include_html
=
(
args
.
include_html
===
"
True
"
);
}
if
(
args
.
include_js
)
{
include_js
=
(
args
.
include_js
===
"
True
"
);
}
if
(
args
.
include_css
)
{
include_css
=
(
args
.
include_css
===
"
True
"
);
}
if
(
args
.
include_header
)
{
include_header
=
(
args
.
include_header
===
"
True
"
);
}
crawler
.
interval
=
250
;
crawler
.
maxConcurrency
=
5
;
crawler
.
maxDepth
=
depth
;
...
...
@@ -43,13 +67,20 @@ crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
readline
.
cursorTo
(
process
.
stdout
,
0
);
process
.
stdout
.
write
(
count
+
""
);
count
+=
1
;
url_list
.
push
({
"
loc
"
:
queueItem
.
url
,
"
stateData
"
:
queueItem
.
stateData
,
"
referrer
"
:
queueItem
.
referrer
});
if
(
include_header
)
{
url_list
.
push
({
"
loc
"
:
queueItem
.
url
,
"
stateData
"
:
queueItem
.
stateData
,
"
referrer
"
:
queueItem
.
referrer
});
}
else
{
url_list
.
push
({
"
loc
"
:
queueItem
.
url
});
}
});
// Fire callback
crawler
.
on
(
"
complete
"
,
function
()
{
readline
.
cursorTo
(
process
.
stdout
,
0
);
...
...
@@ -67,29 +98,38 @@ crawler.on("complete", function() {
crawler
.
on
(
"
fetcherror
"
,
function
(
queueItem
,
response
)
{
console
.
log
(
"
Error
"
+
response
.
statusCode
+
"
while fetching
"
+
queueItem
.
url
);
url_list
.
push
({
"
loc
"
:
queueItem
.
url
,
"
stateData
"
:
queueItem
.
stateData
,
"
referrer
"
:
queueItem
.
referrer
});
if
(
include_header
)
{
url_list
.
push
({
"
loc
"
:
queueItem
.
url
,
"
stateData
"
:
queueItem
.
stateData
,
"
referrer
"
:
queueItem
.
referrer
});
}
});
crawler
.
discoverResources
=
function
(
buffer
,
queueItem
)
{
var
$
=
cheerio
.
load
(
buffer
.
toString
(
"
utf8
"
));
var
tag_a
=
$
(
"
a[href]
"
).
map
(
function
()
{
var
link_list
=
[];
if
(
include_html
)
{
link_list
=
link_list
.
concat
(
$
(
"
a[href]
"
).
map
(
function
()
{
return
$
(
this
).
attr
(
"
href
"
);
}).
get
();
var
tag_link
=
$
(
"
link[href]
"
).
map
(
function
()
{
return
$
(
this
).
attr
(
"
href
"
);
}).
get
();
}).
get
())
}
if
(
include_css
)
{
console
.
log
(
'
************************
'
);
console
.
log
(
include_css
);
link_list
=
link_list
.
concat
(
$
(
"
link[href]
"
).
map
(
function
()
{
return
$
(
this
).
attr
(
"
href
"
);
}).
get
());
}
var
tag_script
=
$
(
"
script[src]
"
).
map
(
function
()
{
return
$
(
this
).
attr
(
"
src
"
);
}).
get
();
return
tag_a
.
concat
(
tag_link
).
concat
(
tag_script
);
if
(
include_js
)
{
link_list
=
link_list
.
concat
(
$
(
"
script[src]
"
).
map
(
function
()
{
return
$
(
this
).
attr
(
"
src
"
);
}).
get
())
}
return
link_list
;
};
// Start Crawl
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment