python - Renaming downloaded images in Scrapy 0.24 with content from an item field while avoiding filename conflicts? -
I'm attempting to rename the images that are downloaded by my Scrapy 0.24 spider. Right now the downloaded images are stored with a SHA1 hash of their URLs as the file names. I'd instead like to name them with the value I extract for item['model']. This question from 2011 outlines what I want, but its answers are for previous versions of Scrapy and don't work with the latest version.
Once I manage to get this working, I'll also need to make sure I account for different images being downloaded with the same filename. So I'll need to download each image to its own uniquely named folder, presumably based on the original URL.
Here is a copy of the code I am using in my pipeline. I got this code from a more recent answer in the link above, but it's not working for me. Nothing errors out, and the images are downloaded as normal. It doesn't seem that my extra code has any effect on the filenames, as they still appear as SHA1 hashes.
pipelines.py
class allenheathpipeline(object): def process_item(self, item, spider): return item import scrapy scrapy.contrib.pipeline.images import imagespipeline scrapy.http import request scrapy.exceptions import dropitem class myimagespipeline(imagespipeline): #name download version def file_path(self, request, response=none, info=none): item=request.meta['item'] # can use item, not url. image_guid = request.url.split('/')[-1] return 'full/%s' % (image_guid) #name thumbnail version def thumb_path(self, request, thumb_id, response=none, info=none): image_guid = thumb_id + request.url.split('/')[-1] return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid) def get_media_requests(self, item, info): #yield request(item['images']) # adding meta. dunno how put in 1 line :-) image in item['images']: yield request(image) def item_completed(self, results, item, info): image_paths = [x['path'] ok, x in results if ok] if not image_paths: raise dropitem("item contains no images") item['image_paths'] = image_paths return item
settings.py
bot_name = 'allenheath' spider_modules = ['allenheath.spiders'] newspider_module = 'allenheath.spiders' item_pipelines = {'scrapy.contrib.pipeline.images.imagespipeline': 1} images_store = 'c:/allenheath/images'
products.py (my spider)
import scrapy import urlparse allenheath.items import productitem scrapy.selector import selector scrapy.http import htmlresponse class productsspider(scrapy.spider): name = "products" allowed_domains = ["http://www.allen-heath.com/"] start_urls = [ "http://www.allen-heath.com/ahproducts/ilive-80/", "http://www.allen-heath.com/ahproducts/ilive-112/" ] def parse(self, response): sel in response.xpath('/html'): item = productitem() item['model'] = sel.css('#prodsingleouter > div > div > h2::text').extract() # value i'd use name images. item['shortdesc'] = sel.css('#prodsingleouter > div > div > h3::text').extract() item['desc'] = sel.css('#tab1 #productcontent').extract() item['series'] = sel.css('#pagestrip > div > div > a:nth-child(3)::text').extract() item['imageorig'] = sel.css('#prodsingleouter > div > div > h2::text').extract() item['image_urls'] = sel.css('#tab1 #productcontent .col-sm-9 img').xpath('./@src').extract() item['image_urls'] = [urlparse.urljoin(response.url, url) url in item['image_urls']] yield item
items.py
import scrapy class productitem(scrapy.item): model = scrapy.field() itemcode = scrapy.field() shortdesc = scrapy.field() desc = scrapy.field() series = scrapy.field() imageorig = scrapy.field() image_urls = scrapy.field() images = scrapy.field()
Here's a pastebin of the output from the command prompt when I run the spider: http://pastebin.com/ir7yzfqf
Any help is appreciated!
the pipelines.py:
from scrapy.contrib.pipeline.images import imagespipeline scrapy.http import request scrapy.exceptions import dropitem scrapy import log class myimagespipeline(imagespipeline): #name download version def file_path(self, request, response=none, info=none): image_guid = request.meta['model'][0] log.msg(image_guid, level=log.debug) return 'full/%s' % (image_guid) #name thumbnail version def thumb_path(self, request, thumb_id, response=none, info=none): image_guid = thumb_id + request.url.split('/')[-1] log.msg(image_guid, level=log.debug) return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid) def get_media_requests(self, item, info): yield request(item['image_urls'][0], meta=item)
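Your settings.py is wrong: it registers the stock Scrapy images pipeline instead of your custom subclass from pipelines.py, so your overridden file_path method is never called and the files keep their SHA1 names. You should use this: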
item_pipelines = {'allenheath.pipelines.myimagespipeline': 1}
For thumbnails to work, add this to settings.py:
images_thumbs = { 'small': (50, 50), 'big': (100, 100), }
Comments
Post a Comment