I'm having a problem with web scraping; here is my code:

from bs4 import BeautifulSoup

import requests
import csv
import argparse

def save_csv_file(filename, array):
    # Python 2 csv idiom: open the output file in binary mode
    with open(filename, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(["item_name", "item_price", "item_category"])
        writer.writerows(array)

def process_data(name, price, category):

    # Fall back to 'NA' when an element was not found
    item_name = name.text if name else 'NA'
    item_price = price.text if price else 'NA'
    item_category = category.text if category else 'NA'

    # Strip spaces and newlines, then encode for the csv writer
    item_name = item_name.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_price = item_price.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")
    item_category = item_category.replace(" ", "").replace("\r\n", "").replace("\n", "").encode("utf-8")

    return (item_name, item_price, item_category)

def do_scrap(filename, url, payload, headers):

    # Request the URL with parameters and headers
    r = requests.post(url, payload, headers=headers, allow_redirects=True)

    if r.status_code == 200:

        # Save response content in html variable
        html = r.content

        # Parse the HTML with BeautifulSoup
        parsed_html = BeautifulSoup(html, "html.parser")

        # Print document title
        print parsed_html.head.find('title').text

        # Find all of the HTML elements that describe the search results
        tables = parsed_html.find_all("a", {"class" : "result-link"})

        # Print the number of records found
        print "Found %s records." % len(tables)

        # Empty helpers
        items = []
        count = 0

        # Loop over the result elements and collect each item's properties
        for table in tables:
            name = table.find("h3", {"class" : "result-title"})
            price = table.find("p", {"class" : "price text-truncate"})
            category = table.find("p", {"class" : "merchant-name text-truncate"})

            items.append(process_data(name, price, category))
            count += 1

        if count > 0:
            # Save array with data to csv file
            save_csv_file(filename = filename, array = items)

            # Print end of job info
            print "\n%s records downloaded and saved to %s." % (count, filename)
    else:
        print "Code error: %s" % r.status_code

if __name__ == '__main__':

    ap = argparse.ArgumentParser()

    ap.add_argument("-p", "--product", required=True, help="Product name")
    ap.add_argument("-c", "--category", default="", help="Product category")

    args = vars(ap.parse_args())

    product = args['product']
    category = args['category']

    payload = {
        'siteSearchQuery': product,
        'from': 'colibri'
    }

    headers = {
        'Host': 'www.kelkoo.co.uk',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'pl-PL,pl;q=0.8,en-US;q=0.6,en;q=0.4',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
    }

    url = "http://www.kelkoo.co.uk/ctl/do/search"

    filename = "%s_co_uk_kelkoo_data.csv" % product

    do_scrap(
        filename=filename,
        url=url,
        payload=payload,
        headers=headers)

After this request I get a different result than when I put this:

www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=nokia+130&from=colibri

into my web browser. What is causing this problem? Is it something related to page redirection?
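(For what it's worth, one way to check whether a redirect happened is to inspect the response object; a minimal sketch, where r is the response returned inside do_scrap above:)

# r.url is the final URL after any redirects; r.history lists the
# intermediate responses (an empty list means no redirect happened).
print r.url
print r.history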

1 Answer

I can see multiple things that will cause you to get different results:

  • You initiate a POST, not a GET. Look up the params argument of requests.get (see the sketch after this list).
  • They use JavaScript to modify the page, so the HTML you receive may not match what the browser renders.
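A minimal sketch of the first point, reusing the url, payload, and headers already defined in your script: send the payload as GET query parameters instead of POSTing it.

# requests encodes payload into the query string, producing the same
# URL you typed into the browser.
r = requests.get(url, params=payload, headers=headers, allow_redirects=True)
print r.url  # http://www.kelkoo.co.uk/ctl/do/search?siteSearchQuery=...&from=colibri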

1 Comment

Yeah, you are right, such a silly mistake ;/ I feel like a newbie ;) I only changed POST to GET and it works perfectly.
