使用 Node.js 开发资讯爬虫流程

2020-06-17 07:04:41易采站长站整理

)
)
})
return this.extractData
}

三、定时任务


cron 每天跑一跑
// 定时任务: start a cron job that runs the spider on the configured schedule.
function job () {
  const options = {
    cronTime: cronConfig.cronTime,  // cron expression, e.g. run once a day
    start: false,                   // don't fire until we call start() below
    onTick: () => {
      spider()
    }
  }
  const task = new cron.CronJob(options)
  task.start()
}

四、数据持久化

数据持久化在理论上不属于爬虫本身关心的范围。这里用 mongoose 创建 Model:


import mongoose from 'mongoose'

// Mongoose model for crawled news items, persisted in the `news` collection.
const { Schema } = mongoose

const newsFields = {
  title: { type: 'String', required: true },
  url: { type: 'String', required: true },
  summary: String,
  recommend: { type: Boolean, default: false },          // recommendation flag, off by default
  source: { type: Number, required: true, default: 0 },  // presumably identifies the crawled site — verify against caller
  status: { type: Number, required: true, default: 0 },  // NOTE(review): status enum semantics not shown here — confirm
  createdTime: { type: Date, default: Date.now }         // set at insertion time
}

const NewsSchema = new Schema(newsFields, { collection: 'news' })

export default mongoose.model('news', NewsSchema)

基本操作


import { OBJ_STATUS } from '../../Constants'
/**
 * Generic persistence helper wrapping a Mongoose-style model whose
 * instances expose a Node-callback `save(cb)` method. Subclasses bind a
 * concrete model (e.g. a NewsService bound to the News model).
 */
class BaseService {
  /**
   * @param {Function} ObjModel - model factory/constructor; `ObjModel(data)`
   *   must return an object with `save((err, result) => ...)`.
   */
  constructor (ObjModel) {
    this.ObjModel = ObjModel
  }

  /**
   * Persist a single document, promisifying the callback-style save.
   * @param {Object} objData - plain data for one document.
   * @returns {Promise<Object>} resolves with the saved document, rejects on save error.
   */
  saveObject (objData) {
    return new Promise((resolve, reject) => {
      this.ObjModel(objData).save((err, result) => {
        if (err) {
          return reject(err)
        }
        return resolve(result)
      })
    })
  }

  /**
   * Persist many documents in parallel. Added because call sites use
   * `batchSave(...)` while the class previously only exposed `saveObject`.
   * @param {Object[]} [objDataList=[]] - documents to save; empty/omitted resolves to [].
   * @returns {Promise<Object[]>} resolves with saved documents in input order;
   *   rejects with the first save error (Promise.all fail-fast).
   */
  batchSave (objDataList = []) {
    return Promise.all(objDataList.map((objData) => this.saveObject(objData)))
  }
}
export default BaseService

资讯


// News service: a thin subclass of BaseService bound to the News model.
// All persistence behavior (saveObject, etc.) is inherited unchanged.
import BaseService from './BaseService'
import News from '../models/News'
class NewsService extends BaseService {}
// Exported as a ready-to-use singleton instance, not as the class.
export default new NewsService(News)

愉快地保存数据


await newsService.batchSave(newsListTem)

更多内容请到 GitHub 把项目 clone 下来查看。

总结