Web scraping dynamic website with scrapy and query string parameters

Question

We want to obtain data from "vivino (dot) com".

Specifically, for each wine, we need both the user ratings and the reviews. Unfortunately, we are running into problems with the query string parameters, the infinite scroll, and the callback functions. We got the following error:

Traceback (most recent call last):
  File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/twisted/internet/defer.py", line 654, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/scrapy/spiders/__init__.py", line 90, in _parse
    return self.parse(response, **kwargs)
  File "/Users/utente/opt/anaconda3/lib/python3.8/site-packages/scrapy/spiders/__init__.py", line 93, in parse
    raise NotImplementedError(f'{self.__class__.__name__}.parse callback is not defined')
NotImplementedError: ScrollSpider.parse callback is not defined

Please see attached code:

    import scrapy
    import json

    # Spider from the question, intended to query Vivino's explore listing and
    # follow every wine to its ratings and reviews.
    # NOTE(review): `start_urls` is set but this class defines no `parse`
    # method; the traceback quoted above shows Scrapy's default `parse` raising
    # NotImplementedError for exactly that reason.
    class ScrollSpider(scrapy.Spider):
        name = 'scroll'
        start_urls = ["https://www.vivino.com/IT/en/"]
    
        # Headers attached to every request this spider builds.
        headers={
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36 OPR/74.0.3911.107",
            "Accept": "application/json",
            "Content-Type": "application/json",
            # NOTE(review): "be" is presumably a typo for "br" — confirm.
            "Accept-Encoding": "gzip, deflate, be",
            "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
        }

        # Filter and sort parameters meant for the explore request.
        query_string = {
            "country_code": "IT",
            "currency_code":"EUR",
            "grape_filter":"varietal",
            "min_rating":"1",
            "order_by":"price",
            "order":"asc",
            "page": 1,
            "price_range_max":"500",
            "price_range_min":"1",
            "region_ids[]":"394",
        }


        # Builds the request for the explore page and hands its response to
        # `parse_products`.
        # NOTE(review): none of the code shown uses `parse_explore` as a
        # callback, so nothing appears to invoke it — confirm.
        def parse_explore(self, response): # go to explore page
        # NOTE(review): the statement below sits at the same indentation as the
        # `def` line instead of one level deeper; Python rejects that with an
        # IndentationError.
        yield scrapy.Request(
            url = "https://www.vivino.com/explore",
            headers=self.headers,
            # NOTE(review): `query_string` is a dict, while Scrapy documents
            # `body` as bytes or str; the parameters presumably belong in the
            # URL's query string instead — confirm.
            body=self.query_string,
            callback=self.parse_products,
        )

For each query result, build a request:

    # Callback for the explore response: parses the body as JSON and yields one
    # product-page request per iterated item, handled by `parse_wine`.
    # NOTE(review): the accepted answer below reads the result list from
    # ['explore_vintage']['matches'] of the /api/explore/explore response. This
    # loop iterates `data` itself and always reads matches[2], so presumably it
    # should walk the matches list instead — confirm against the real payload.
    def parse_products(self, response): 
        raw_json = response.body
        data = json.loads(raw_json)
        
        for wine in data:
            # Identifier, price, year and SEO name, all taken from the match
            # at index 2.
            wine_id = wine['explore_vintage']['matches'][2]['vintage']['wine']['id']
            wine_price_id = wine['explore_vintage']['matches'][2]['price']['id']
            wine_year_id = wine['explore_vintage']['matches'][2]['vintage']['year']
            wine_name_id = wine['explore_vintage']['matches'][2]['vintage']['seo_name']
            yield scrapy.Request(
                f"https://www.vivino.com/IT/en/{wine_name_id}/w/{wine_id}?year={wine_year_id}&price_id={wine_price_id}",
                callback=self.parse_wine,
                headers=self.headers,
            )

For each query result, build the product link:

     # Callback for a product-page response: reads the wine id and year from the
     # first review in the parsed body and requests that wine's reviews from the
     # API, handled by `parse_review`.
     # NOTE(review): the response comes from the product-page URL built in
     # `parse_products`; presumably that page is HTML rather than JSON, in which
     # case `json.loads` would fail — confirm.
     def parse_wine(self, response): 
            raw_json = response.body
            data = json.loads(raw_json)
            wine_id = data['reviews'][0]['vintage']['wine']['id']
            wine_year_id = data['reviews'][0]['vintage']['year']
        # NOTE(review): this `yield` is indented less than the statements above
        # it, which does not match any enclosing indentation level; Python
        # rejects that with an IndentationError.
        yield scrapy.Request(
            # NOTE(review): the accepted answer below states that Vivino caps
            # per_page at 50 on this endpoint, so a single request with
            # per_page=1000 would not return every review.
            url = f"https://www.vivino.com/api/wines/{wine_id}/reviews?per_page=1000&page=1&year={wine_year_id}", # 1000 per page
            headers=self.headers,
            callback=self.parse_review,
            )

        # Callback for the reviews API response: parses the body as JSON and
        # yields a dict with a "rating" and a "review" entry per iterated item.
        # NOTE(review): the loop iterates `reviews` itself but then indexes
        # review['reviews'][0] on every pass. The accepted answer below reads
        # the list from the 'reviews' key of the payload, so presumably this
        # should iterate reviews['reviews'] and read each item's 'rating' and
        # 'note' — confirm.
        def parse_review(self, response): # look for product rating and review
        # NOTE(review): the statements below sit at the same indentation as the
        # `def` line instead of one level deeper; Python rejects that with an
        # IndentationError.
        reviews = json.loads(response.body)
        for review in reviews:
            yield {
                "rating": review['reviews'][0]['rating'],
                "review": review['reviews'][0]['note'],
            }

Gustavo de Rosa · Accepted Answer · 2021-03-19 18:08:21Z

Unfortunately, Vivino caps the per_page parameter at 50 on the https://www.vivino.com/api/wines/{wine_id}/reviews endpoint, so you will need to iterate over the pages as well.

You can accomplish the same thing with requests and it's more straightforward (in my opinion):

import requests

# Script: walk every page of Vivino's explore results for the filters below
# and, for each wine found, page through its reviews until none are left.
# The rating and note of every review end up in `collected_reviews`.

# Only the User-Agent header needs to be overridden for these endpoints.
headers = {
    "User-Agent": ""
}

# Query-string filters for the explore endpoint (the minimal set needed).
payload = {
    "min_rating": 1,
    "order_by": "price",
    "order": "asc",
    "price_range_max": 500,
    "price_range_min": 1,
    "region_ids[]": 394
}

EXPLORE_URL = 'https://www.vivino.com/api/explore/explore?'

# The explore endpoint returns 25 results per page.
RESULTS_PER_PAGE = 25

# One {"wine_id", "rating", "review"} dict per review fetched.
collected_reviews = []

# Initial request, used only to learn how many results match the filters.
r = requests.get(EXPLORE_URL, params=payload, headers=headers)
r.raise_for_status()  # fail loudly on an HTTP error instead of on a bad body
n_matches = r.json()['explore_vintage']['records_matched']

# Ceiling division: a final, partially filled page must still be requested.
# Truncating here would skip it, and would fetch nothing at all when fewer
# than RESULTS_PER_PAGE results match.
n_pages = -(-n_matches // RESULTS_PER_PAGE)

for page in range(1, n_pages + 1):
    # Pages are 1-based on the explore endpoint.
    payload['page'] = page

    print(f'Requesting data from page: {payload["page"]}')

    r = requests.get(EXPLORE_URL, params=payload, headers=headers)
    r.raise_for_status()
    matches = r.json()['explore_vintage']['matches']

    for match in matches:
        # The wine identifier is what the reviews endpoint is keyed on.
        _id = match['vintage']['wine']['id']

        page_counter = 1

        # The reviews endpoint gives no page count up front, so keep asking
        # for the next page until one comes back empty.
        while True:
            print(f'Requesting reviews from wine: {_id} and page: {page_counter}')

            # per_page=50 is the largest page size the endpoint accepts.
            r = requests.get(f'https://www.vivino.com/api/wines/{_id}/reviews?per_page=50&page={page_counter}',
                             headers=headers)
            r.raise_for_status()
            reviews = r.json()['reviews']

            print(f'Number of reviews: {len(reviews)}')

            # An empty page means every review of this wine has been read.
            if not reviews:
                break

            # Keep what the question asks for; .get() tolerates a review
            # that lacks one of the fields.
            collected_reviews.extend(
                {
                    "wine_id": _id,
                    "rating": review.get('rating'),
                    "review": review.get('note'),
                }
                for review in reviews
            )

            page_counter += 1

Sorry for the all-in-one script and the possibly repetitive definitions; I wrote it in a hurry just to check whether it would work.

Best regards, Gustavo.

Collectives™ on Stack Overflow

Web scraping dynamic website with scrapy and query string parameters

1 Answer 1

Comments

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

Comments

Your Answer

Sign up or log in

Post as a guest

Related