各种爬虫管道


title: 各种爬虫管道
date: 2021-09-10 09:36:45
categories:

  • IT技术
  • 爬虫
  • Scrapy
tags:
  • IT技术
  • 爬虫
  • Scrapy

摘要:各种爬虫管道 from datetime import datetime from scrapy.exporters import JsonItemExporter, CsvItemExporter import pymongo import redis from .settings import REDIS_HOST, REDIS_PORT, MONGO_HOST, MONGO_PORT ……

各种爬虫管道

各种爬虫管道

import json
from datetime import datetime

import pymongo
import redis
from scrapy.exporters import JsonItemExporter, CsvItemExporter

from .settings import REDIS_HOST, REDIS_PORT, MONGO_HOST, MONGO_PORT

数据源的管道

class AqiDataPipeline(object):
    """Stamp every scraped item with crawl metadata.

    Adds two fields to each item: ``crawl_time`` (when the item passed
    through this pipeline) and ``spider`` (the name of the spider that
    produced it).
    """

    def process_item(self, item, spider):
        """Record the crawl time and the originating spider on ``item``.

        Args:
            item: Mutable, mapping-like item; it is updated in place.
            spider: Spider that produced the item; its ``name`` is stored.

        Returns:
            The same ``item`` object, so later pipelines can keep using it.
        """
        # Crawl time, kept as a naive datetime expressed in UTC.
        item['crawl_time'] = datetime.utcnow()
        # Name of the spider that scraped the item.
        item['spider'] = spider.name
        return item

Json的管道

class AqiJsonPipeline(object):
    """Export every scraped item to the file ``aqi.json``.

    The output file is opened when the spider starts and closed when it
    finishes; items are written through a ``JsonItemExporter``.
    """

    def open_spider(self, spider):
        """Open ``aqi.json`` and start the JSON export."""
        # The exporter is handed the raw file object opened in binary mode.
        self.file = open("aqi.json", 'wb')
        self.write = JsonItemExporter(self.file)
        self.write.start_exporting()

    def process_item(self, item, spider):
        """Write ``item`` to the export file and return it unchanged."""
        self.write.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        try:
            self.write.finish_exporting()
        finally:
            # Close the file even if finishing the export fails, so the
            # handle is never leaked.
            self.file.close()

Csv的管道

class AqiVscPipeline(object):
    """Export every scraped item to the file ``aqi.csv``.

    The output file is opened when the spider starts and closed when it
    finishes; items are written through a ``CsvItemExporter``.

    NOTE(review): the class name looks like a typo for "Csv"; it is kept
    as-is because project settings may reference it by name.
    """

    def open_spider(self, spider):
        """Open ``aqi.csv`` and start the CSV export."""
        # The exporter is handed the raw file object opened in binary mode.
        self.file = open("aqi.csv", 'wb')
        self.write = CsvItemExporter(self.file)
        self.write.start_exporting()

    def process_item(self, item, spider):
        """Write ``item`` to the export file and return it unchanged."""
        self.write.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        try:
            self.write.finish_exporting()
        finally:
            # Close the file even if finishing the export fails, so the
            # handle is never leaked.
            self.file.close()

mongodb数据库管道

class AqiMongoPipeline(object):
    """Store every scraped item in MongoDB (database ``Aqi``, collection ``aqi``)."""

    def open_spider(self, spider):
        """Connect to MongoDB and select the target collection."""
        self.client = pymongo.MongoClient(host=MONGO_HOST, port=MONGO_PORT)
        self.db = self.client['Aqi']
        self.collection = self.db['aqi']

    def process_item(self, item, spider):
        """Insert a copy of ``item`` as one document and return the item.

        ``dict(item)`` makes a plain-dict copy, so the insert does not
        modify the item that later pipelines receive.
        """
        # ``Collection.insert`` was deprecated in PyMongo 3 and removed in
        # PyMongo 4; ``insert_one`` is the supported single-document call.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection."""
        self.client.close()

redis数据库管道

class AqiRedisPipeline(object):
    """Push every scraped item onto the Redis list ``aqi`` as a JSON string."""

    def open_spider(self, spider):
        """Create the Redis client used for the whole crawl."""
        self.client = redis.Redis(host=REDIS_HOST, port=REDIS_PORT)

    @staticmethod
    def _json_default(value):
        """Serialize values ``json`` cannot handle natively.

        Datetimes become ISO-8601 strings; any other unsupported type
        raises ``TypeError`` rather than being stringified silently.
        """
        if isinstance(value, datetime):
            return value.isoformat()
        raise TypeError(
            "Object of type %s is not JSON serializable" % type(value).__name__
        )

    def process_item(self, item, spider):
        """Serialize ``item`` to JSON, push it to the ``aqi`` list, return it.

        redis-py (3.0 and later) only accepts bytes, str, int or float
        values, so a raw ``dict`` is rejected; the item is therefore
        encoded as a JSON string first.
        """
        payload = json.dumps(
            dict(item), ensure_ascii=False, default=self._json_default
        )
        self.client.lpush('aqi', payload)
        return item

    def close_spider(self, spider):
        """Release the Redis connections held by the client's pool."""
        self.client.connection_pool.disconnect()

来源网址:https://www.cnblogs.com/hanjian200ok/p/9526028.html

使用 Hugo 构建
主题 StackJimmy 设计