--maxsize number      Limit the maximum size of downloaded files
--minwidth number     Limit the minimum width of downloaded images
--minheight number    Limit the minimum height of downloaded images
-i, --info            View the configuration file
-l, --list array      View the queue data, e.g. [page/down/queue],0,-1
-f, --find array      Find the download URL of a local file
--json                Print results in JSON format
-v, --version         View version
-h, --help            View help
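
For example, the size filters can be combined with a normal crawl to skip thumbnails and oversized files, and -l can inspect what has been queued. The values below are illustrative, and I'm assuming --maxsize is measured in bytes:

$ crawl-pet -u https://www.reddit.com/r/funny/ -o reddit --minwidth 400 --minheight 300 --maxsize 10485760
$ crawl-pet -l down,0,-1    # list the whole download queue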
Finally, here is a sample configuration:
$ crawl-pet -u https://www.reddit.com/r/funny/ -o reddit --save group
info.json
{
  "url": "https://www.reddit.com/r/funny/",
  "outdir": ".",
  "save": "group",
  "types": "",
  "limit": "5",
  "parser": "my_parser.js",
  "sleep": "200",
  "timeout": "180000",
  "proxy": "",
  "maxsize": 0,
  "minwidth": 0,
  "minheight": 0,
  "cookie": "over18=1"
}
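
Since "parser" points at the custom parser shown next, a quick sanity check is to load the saved config and confirm the parser file is where the crawler expects it. This is a plain Node sketch of my own, assuming info.json lives in the reddit output directory created by -o:

const fs = require('fs')
const path = require('path')

const dir = 'reddit' // assumed output directory from -o reddit
const info = JSON.parse(fs.readFileSync(path.join(dir, 'info.json'), 'utf8'))

// In this sketch the parser path is taken to be relative to the output directory.
const parserFile = path.join(dir, info.parser)
console.log('parser exists:', fs.existsSync(parserFile))
console.log('crawl root:', info.url)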
my_parser.js
exports.body = function (url, body, response, crawler_handle) {
  const re = /\b(data-url|href|src)\s*=\s*["']([^'"#]+)/ig
  var m = null
  while (m = re.exec(body)) {
    let href = m[2]
    // Skip thumbnails, user/icon assets and non-media resources
    if (/thumb|user|icon|\.(css|json|js|xml|svg)\b/i.test(href)) {
      continue
    }
    // Queue images and videos for download
    if (/\.(png|gif|jpg|jpeg|mp4)\b/i.test(href)) {
      crawler_handle.addDown(href)
      continue
    }
    // Follow links to other subreddit pages
    if (/reddit\.com\/r\//i.test(href)) {
      crawler_handle.addPage(href)
    }
  }
  crawler_handle.over()
}

If you're familiar with reddit, that's all there is to it.
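
Because the parser is just an exported function, it can be exercised outside the crawler with a stubbed crawler_handle. The sample HTML and the stub below are my own; only addDown, addPage and over come from the code above:

// test_parser.js (run with: node test_parser.js)
const parser = require('./my_parser.js')

const sampleBody = `
  <a href="https://i.redd.it/abc123.jpg">picture</a>
  <a href="https://www.reddit.com/r/funny/comments/xyz/">thread</a>
  <img src="https://a.thumbs.redditmedia.com/thumb.png">
  <link href="https://www.redditstatic.com/style.css">
`

// Stub that records what the parser would queue.
const handle = {
  downs: [],
  pages: [],
  addDown(url) { this.downs.push(url) },
  addPage(url) { this.pages.push(url) },
  over() { console.log('downloads:', this.downs, '\npages:', this.pages) }
}

parser.body('https://www.reddit.com/r/funny/', sampleBody, null, handle)

Running this, the .jpg should land in the download list, the comments link in the page list, while the thumbnail and the stylesheet are filtered out.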
The GitHub repository is here: https://github.com/wl879/Crawl-pet