'GuideSpider' object is not subscriptable
I have a Scrapy project that I want to use to crawl some websites.
When I try to save all the scraped information to a MySQL database, I get the error in the title.
I have looked everywhere and it seems to be a "list" problem, possibly related to the items[] list...
Please help me understand what this error means and where in the code I need to fix it.
Please also explain why it happens, because I want to understand it. Thanks a lot.
Spider code:
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.selector import HtmlXPathSelector

from gscrape.items import GscrapeItem


class GuideSpider(CrawlSpider):
    name = "Dplay"
    allowed_domains = ['www.example.com']
    start_urls = [
        "http://www.examplea.com/forums/forumdisplay.php?f=108&order=desc&page=1"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=("forumdisplay.php.*f=108.*page=")),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        sites = hxs.select('//div')
        for site in sites:
            item = GscrapeItem()
            item['title'] = site.select('a[@class="threadcolor"]/text()').extract()
            item['guide_url'] = site.select('a[@class="threadcolor"]/@href').extract()
            item['subject'] = site.select('./text()[1]').extract()
            items.append(item)
        return items
Pipeline code:
from scrapy.exceptions import DropItem
from string import join
from scrapy import log
from twisted.enterprise import adbapi

import MySQLdb.cursors


class GscrapePipeline(object):
    def process_item(self, item, spider):
        if item['guide_url']:
            item['guide_url'] = "http://www.example.com/forums/" + join(item['guide_url'])
            return item
        else:
            raise DropItem()


class MySQLStorePipeline(object):
    def __init__(self):
        # @@@ hardcoded db settings
        # TODO: make settings configurable through settings
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db='prova',
            host='127.0.0.1',
            user='root',
            passwd='',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True
        )

    def process_item(self, item, spider):
        # run the db query in the thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        # create the record if it doesn't exist
        # this block runs in its own thread
        tx.execute("select * from prova where guide_url = %s", item['guide_url'])
        result = tx.fetchone()
        if result:
            log.msg("Item already stored in db: %s" % item, level=log.DEBUG)
        else:
            tx.execute(
                "insert into prova (title, guide_url, subject) "
                "values (%s, %s, %s)",
                (item['title'],
                 item['guide_url'],
                 item['subject'])
            )
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)
The error: exceptions.TypeError: 'GuideSpider' object is not subscriptable (line 47), in pipelines.py
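For reference, this is my (possibly wrong) understanding of what "not subscriptable" means in plain Python, taken from a standalone snippet I tried; the Dummy class below is just an illustration and not part of my project, and the exact wording of the message seems to depend on the Python version:

# Standalone illustration only -- "Dummy" is a made-up class, not project code.
# As I understand it, obj[...] only works when the object defines __getitem__,
# which dicts, lists and Scrapy items do.
class Dummy(object):
    pass

d = Dummy()
print(d['guide_url'])  # TypeError: 'Dummy' object is not subscriptable

I don't understand why the object reaching line 47 of pipelines.py would be the spider itself instead of an item, though.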