Python — Scrapy pipeline to export a CSV file in the right format
I made improvements according to the suggestion from alecxe below, but the problem shown in the picture below remains: each row/line should be one review — date, rating, review text, and link.
I need to let the item processor process each review of every page.
Currently TakeFirst() only takes the first review of each page, so with 10 pages I get only 10 lines/rows, as in the picture below.
The spider code is below:
import scrapy

from amazon.items import AmazonItem


class AmazonSpider(scrapy.Spider):
    """Spider that collects one item per review from Amazon UK review pages.

    Yields one AmazonItem (rating, date, review, link) for every review
    table cell found on each of the pre-generated review pages.
    """

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    # The original called .format(page) on a URL with no placeholder, so every
    # start URL was identical. A pageNumber placeholder restores one distinct
    # URL per review page (pages 1..113).
    # NOTE(review): exact query-parameter name assumed from Amazon's review
    # URLs of the era — confirm against the live site.
    start_urls = [
        'http://www.amazon.co.uk/product-reviews/B0042EU3A2/?pageNumber={0}'.format(page)
        for page in range(1, 114)  # range works on both Python 2 and 3 (xrange is py2-only)
    ]

    def parse(self, response):
        """Extract every review on the page and yield one item per review."""
        # XPath ids are case-sensitive: Amazon's container id is "productReviews".
        for sel in response.xpath('//*[@id="productReviews"]//tr/td[1]'):
            item = AmazonItem()
            item['rating'] = sel.xpath('div/div[2]/span[1]/span/@title').extract()
            item['date'] = sel.xpath('div/div[2]/span[2]/nobr/text()').extract()
            item['review'] = sel.xpath('div/div[6]/text()').extract()
            item['link'] = sel.xpath('div/div[7]/div[2]/div/div[1]/span[3]/a/@href').extract()
            yield item
I started from scratch; the following spider should be run with
scrapy crawl amazon -t csv -o amazon.csv --loglevel=info
so that opening the CSV file with a spreadsheet shows the expected rows.
Hope that helps :-)
import scrapy


class AmazonItem(scrapy.Item):
    """One scraped review: its rating, date, text, and permalink."""

    rating = scrapy.Field()
    date = scrapy.Field()
    review = scrapy.Field()
    link = scrapy.Field()


class AmazonSpider(scrapy.Spider):
    """Self-contained spider: scrapes every review page via the "Next" link.

    Starts from the first review page and follows pagination, yielding one
    AmazonItem per review so each CSV row is a single review.
    """

    name = "amazon"
    allowed_domains = ['amazon.co.uk']
    start_urls = ['http://www.amazon.co.uk/product-reviews/B0042EU3A2/']

    def parse(self, response):
        """Yield one item per review on this page, then follow the Next link."""
        # XPath string comparisons are case-sensitive; Amazon used the ids
        # "productReviews" / class "reviewText" with this capitalization.
        for sel in response.xpath('//table[@id="productReviews"]//tr/td/div'):
            item = AmazonItem()
            item['rating'] = sel.xpath('./div/span/span/span/text()').extract()
            item['date'] = sel.xpath('./div/span/nobr/text()').extract()
            item['review'] = sel.xpath('./div[@class="reviewText"]/text()').extract()
            # NOTE(review): anchor text capitalization ("Permalink") assumed —
            # the page source lowercased it; confirm against the live markup.
            item['link'] = sel.xpath('.//a[contains(.,"Permalink")]/@href').extract()
            yield item

        # Follow pagination: the "Next" anchor inside the paging span that
        # follows the review table. If absent, we are on the last page.
        xpath_next_page = ('.//table[@id="productReviews"]/following::*'
                           '//span[@class="paging"]/a[contains(.,"Next")]/@href')
        if response.xpath(xpath_next_page):
            url_next_page = response.xpath(xpath_next_page).extract()[0]
            # Must be scrapy.Request (capital R) — scrapy.request would raise
            # AttributeError at runtime.
            request = scrapy.Request(url_next_page, callback=self.parse)
            yield request
Comments
Post a Comment