We aim at obtaining data from "vivino (dot) com".
Specifically, for each wine, we need both user ratings and reviews. Unfortunately, we are coping with issues with query string parameters and infinite scroll and callbacks of the functions. We got the following error:
Traceback (most recent call last):
File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/scrapy/spiders/__init__.py", line 90, in _parse
return self.parse(response, **kwargs)
File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/scrapy/spiders/__init__.py", line 93, in parse
raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
NotImplementedError: ScrollSpider.parse callback is not defined
Please see attached code:
import scrapy
import json
class ScrollSpider(scrapy.Spider):
name = 'scroll'
start_urls = ["https://www.vivino.com/IT/en/"]
headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.107",
"Accept": "application/json",
"Content-Type": "application/json",
"Accept-Encoding": "gzip, deflate, be",
"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
}
query_string = {
"country_code": "IT",
"currency_code":"EUR",
"grape_filter":"varietal",
"min_rating":"1",
"order_by":"price",
"order":"asc",
"page": 1,
"price_range_max":"500",
"price_range_min":"1",
"region_ids[]":"394",
}
def parse_explore(self, response): # go to explore page
yield scrapy.Request(
url = "https://www.vivino.com/explore",
headers=self.headers,
body=self.query_string,
callback=self.parse_products,
)
for each query result build request
def parse_products(self, response):
raw_json = response.body
data = json.loads(raw_json)
for wine in data:
wine_id = wine['explore_vintage']['matches'][2]['vintage']['wine']['id']
wine_price_id = wine['explore_vintage']['matches'][2]['price']['id']
wine_year_id = wine['explore_vintage']['matches'][2]['vintage']['year']
wine_name_id = wine['explore_vintage']['matches'][2]['vintage']['seo_name']
yield scrapy.Request(
f"https://www.vivino.com/IT/en/{wine_name_id}/w/{wine_id}?year={wine_year_id}&price_id={wine_price_id}",
callback=self.parse_wine,
headers=self.headers,
)
for each query result build product link
def parse_wine(self, response):
raw_json = response.body
data = json.loads(raw_json)
wine_id = data['reviews'][0]['vintage']['wine']['id']
wine_year_id = data['reviews'][0]['vintage']['year']
yield scrapy.Request(
url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=1000&page=1&year={wine_year_id}", # 1000 per page
headers=self.headers,
callback=self.parse_review,
)
def parse_review(self, response): # look for product rating and review
reviews = json.loads(response.body)
for review in reviews:
yield {
"rating": review['reviews'][0]['rating'],
"review": review['reviews'][0]['note'],
}