Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
Python Scrapy crawler for https://www.aldi.com.au/ that navigates through the subcategories of the Groceries menu and extracts their product listings.
  • Loading branch information
umerfsandhu authored Nov 10, 2021
1 parent d46a8f2 commit d3f3366
Show file tree
Hide file tree
Showing 2 changed files with 263 additions and 0 deletions.
47 changes: 47 additions & 0 deletions Aldi_spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import scrapy

from .. import items
from ..items import AldiProjectItem
class AldiSpider(scrapy.Spider):
    """Crawl the ALDI Australia groceries section and yield one dict per product.

    ``parse`` follows every subcategory link found on the start page and
    ``parse_subcategories`` extracts the products listed on each of them.
    """

    name = 'Aldi'
    start_urls = [
        'https://www.aldi.com.au/en/groceries/'
    ]
    base_url = 'https://www.aldi.com.au/'

    def parse(self, response):
        """Yield a follow-up request for each subcategory link on the page."""
        subcategory_links = response.xpath(
            '//div[@class="productworld--list-item ym-gl ym-g16"]/a/@href'
        ).getall()
        for link in subcategory_links:
            yield response.follow(link, callback=self.parse_subcategories)

    def parse_subcategories(self, response):
        """Yield a dict describing each product tile on a subcategory page.

        Every field is read relative to its own product tile rather than
        from page-wide lists zipped together, so a product that lacks one
        field (e.g. no per-unit price) can no longer shift the values of
        the products that follow it.

        Yields:
            dict with keys ``product_title``, ``productImage``, ``packSize``,
            ``price`` (dollar part + cent part, as scraped) and
            ``pricePerUnit``. Fields missing from a tile are ``None``.
        """
        # NOTE(review): assumes the description header and the price box are
        # descendants of the same "box m-text-image" div that holds the
        # image -- confirm against the live markup.
        for tile in response.xpath('//div[@class="box m-text-image"]'):
            title_parts = tile.xpath(
                './/div[@class="box--description--header"]/text()'
            ).getall()
            # The header text is padded with tabs/newlines/spaces; strip
            # all of it and drop whitespace-only text nodes.
            product_title = ' '.join(
                part.strip() for part in title_parts if part.strip()
            )
            if not product_title:
                # A box without a title is not a product tile.
                continue

            price_box = tile.xpath('.//div[@class="box--price"]')
            dollars = price_box.xpath(
                './span[@class="box--value"]/text()'
            ).get(default='')  # price in $
            cents = price_box.xpath(
                './span[@class="box--decimal"]/text()'
            ).get(default='')  # price in cents

            yield {
                'product_title': product_title,
                'productImage': tile.xpath('./div/div[1]/img/@src').get(),
                'packSize': price_box.xpath(
                    './span[@class="box--amount"]/text()'
                ).get(),
                'price': (dollars + cents) or None,
                'pricePerUnit': price_box.xpath(
                    './span[@class="box--baseprice"]/text()'
                ).get(),
            }




Loading

0 comments on commit d3f3366

Please sign in to comment.