I'm having a problem with web scraping; here is my code:

from bs4 import BeautifulSoup

import requests
import csv
import argparse

def save_csv_file(filename, array):
    # Python 2 csv idiom: open the output file in binary mode
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name", "item_price", "item_category"])
        writer.writerows(array)

def process_data(name, price, category):

    # Fall back to 'NA' when an element was not found
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'

    # Strip spaces and newlines, then encode for the csv writer
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")

    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):

    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers=headers, allow_redirects=True)

    if r.status_code == 200:

        # Save response content in html variable
        html = r.content

        # Parse the HTML with BeautifulSoup
        parsed_html = BeautifulSoup(html, "html.parser")

        # Print document title
        print parsed_html.head.find('title').text

        # Find all of the HTML elements that describe the search results
        tables = parsed_html.find_all("a", {"class" : "result-link"})

        # Print the number of records found
        print "Found %s records." % len(tables)

        # Empty helpers
        items = []
        count = 0

        # Loop over the result elements and collect each item's properties
        for table in tables:
            name = table.find("h3", {"class" : "result-title"})
            price = table.find("p", {"class" : "price text-truncate"})
            category = table.find("p", {"class" : "merchant-name text-truncate"})

            items.append(process_data(name, price, category))
            count += 1

        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename = filename, array = items)

            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':

    ap = argparse.ArgumentParser()

    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")

    args = vars(ap.parse_args())

    product = args['product']
    category = args['category']

    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }

    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }

    url = "http://www.kelkoo.co.uk/ctl/do/search"

    filename = "%s_co_uk_kelkoo_data.csv" % product

    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)

After this request I get a different result than when I put this:

www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri

into my web browser. What is causing this problem? Is it something related to page redirection?
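(For what it's worth, one way to check whether a redirect happened is to inspect the response object; a minimal sketch, where r is the response returned inside do_scrap above:)

# r.url is the final URL after any redirects; r.history lists the
# intermediate responses (an empty list means no redirect happened).
print r.url
print r.history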

1 Answer

I can see multiple things that will cause you to get different results:

  • You initiate a POST, not a GET. Look up the params argument of requests.get (see the sketch after this list).
  • They use JavaScript to modify the page, so the HTML you receive may not match what the browser renders.
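A minimal sketch of the first point, reusing the url, payload, and headers already defined in your script: send the payload as GET query parameters instead of POSTing it.

# requests encodes payload into the query string, producing the same
# URL you typed into the browser.
r = requests.get(url, params=payload, headers=headers, allow_redirects=True)
print r.url  # http://www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=...&from=colibri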

1 Comment

Yeah, you are right, such a silly mistake ;/ I feel like a newbie ;) I only changed POST to GET and it works perfectly.
