From 6c4ee3d50c27ecfd7d7bd3f29ddcbfd117726395 Mon Sep 17 00:00:00 2001
From: dragonzurfer
Date: Thu, 10 Aug 2017 14:03:47 +0530
Subject: [PATCH 1/6] Get submissions

---
 ACedIt/main.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ACedIt/main.py b/ACedIt/main.py
index 65eb620..a39b2cc 100644
--- a/ACedIt/main.py
+++ b/ACedIt/main.py
@@ -1,6 +1,6 @@
 import sys
 import util
-
+import submission as sub
 
 supported_sites = ["codeforces", "codechef", "hackerrank", "spoj"]
 
@@ -27,6 +27,11 @@ def main():
         else:
             # fetch all problems for the contest
             util.Utilities.download_contest_testcases(args)
+
+        if args["submission"] is not None:
+            # fetch submission
+            sub.Utilities.download_submission(args)
+
+
     except KeyboardInterrupt:
         # Clean up files here
         print "Interruped manually. Exiting gracefully."

From b112641c148a901da352802d478c08d0bd854cfb Mon Sep 17 00:00:00 2001
From: dragonzurfer
Date: Thu, 10 Aug 2017 14:04:50 +0530
Subject: [PATCH 2/6] Add flags for submission, status and username

---
 ACedIt/util.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/ACedIt/util.py b/ACedIt/util.py
index 3255af9..3d4ae6c 100644
--- a/ACedIt/util.py
+++ b/ACedIt/util.py
@@ -45,6 +45,18 @@ def parse_flags():
                             action="store_true",
                             help="Force download the test cases, even if they are cached")
 
+        parser.add_argument("-sub","--submission",
+                            dest="submission",
+                            help="The number of AC'd submissions")
+
+        parser.add_argument("-status",
+                            dest="status",
+                            help="The submission status, e.g. AC, WA etc")
+
+        parser.add_argument("-u","--username",
+                            dest="username",
+                            help="The username, e.g. rng_58, rajat1603 etc")
+
         parser.set_defaults(force=False)
 
         args = parser.parse_args()
@@ -70,6 +82,9 @@ def parse_flags():
         flags["problem"] = args.problem
         flags["force"] = args.force
         flags["site"] = flags["site"].lower()
+        flags["submission"] = args.submission
+        flags["status"] = args.status
+        flags["username"] = args.username
 
         return flags
 

From 655b5cfe217937c7377d39b621c271866a49e2ee Mon Sep 17 00:00:00 2001
From: dragonzurfer
Date: Thu, 10 Aug 2017 14:05:18 +0530
Subject: [PATCH 3/6] Initial commit

---
 ACedIt/submission.py | 67 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 ACedIt/submission.py

diff --git a/ACedIt/submission.py b/ACedIt/submission.py
new file mode 100644
index 0000000..c19cb42
--- /dev/null
+++ b/ACedIt/submission.py
@@ -0,0 +1,67 @@
+import json
+import re
+import os
+import util
+try:
+    from bs4 import BeautifulSoup as bs
+    import requests as rq
+    import grequests as grq
+    from argparse import ArgumentParser
+except:
+    err = """
+    You haven't installed the required dependencies.
+    Run 'python setup.py install' to install the dependencies.
+    """
+    print err
+    sys.exit(0)
+
+
+class Utilities:
+
+    cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "ACedIt")
+
+    @staticmethod
+    def download_submission(args):
+        if args["site"] == "codechef":
+            platform = Codechef(args)
+
+        platform.get_submission()
+
+class Codechef:
+    """
+    Class to handle downloading of test cases from Codeforces
+    """
+
+    def __init__(self, args):
+        self.site = args["site"]
+        self.contest = args["contest"]
+        self.problem = args["problem"]
+        self.nos = int(args["submission"])
+        self.status = args["status"]
+        self.status_dict = {"AC":"15", "WA":"14", "TLE":"13", "RTE":"12", "CTE":"11"}
+        if args["username"] is not None:
+            self.username = args["username"]
+        else:
+            self.username = ""
+
+    def get_submission(self):
+        """
+        Method to get submissions
+        """
+
+        print "Fetching submissions for " + self.contest + "-" + self.problem + " from Codechef..."
+        status = self.status_dict[self.status]
+        url = "https://www.codechef.com/"+ self.contest + "/status/" + self.problem + "?sort_by=Date%2FTime&sorting_order=asc&language=All&" "status=" + status + "&handle=" + self.username
+        res = util.Utilities.get_html(url)
+        soup = bs(res.text, 'html.parser')
+        cnt = 1
+
+        for link in soup.find_all('a'):
+            link = str(link.get('href'))
+            match = re.search(r'viewsolution',link)
+            if match:
+                print "https://codechef.com" + link
+                cnt+=1
+                if cnt > self.nos:
+                    break
+

From 4dd193697759a50bdf7370508e2a63cb573ef140 Mon Sep 17 00:00:00 2001
From: dragonzurfer
Date: Wed, 23 Aug 2017 20:25:27 +0530
Subject: [PATCH 4/6] progressbar,lxml

---
 requirements.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index eed60a9..dcef59b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,14 @@
 beautifulsoup4==4.6.0
 certifi==2017.4.17
 chardet==3.0.4
+futures==3.1.1
 gevent==1.2.2
 greenlet==0.4.12
 grequests==0.3.0
 idna==2.5
-pkg-resources==0.0.0
+lxml==3.8.0
+progressbar2==3.34.2
+python-utils==2.2.0
 requests==2.18.1
+six==1.10.0
 urllib3==1.21.1

From 2f0939d76f51a9f5f0d00aae1e94bf9e97d9d432 Mon Sep 17 00:00:00 2001
From: dragonzurfer
Date: Wed, 23 Aug 2017 20:26:12 +0530
Subject: [PATCH 5/6] Add multithreaded submission scraping for Codeforces

---
 ACedIt/submission.py | 266 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 239 insertions(+), 27 deletions(-)

diff --git a/ACedIt/submission.py b/ACedIt/submission.py
index c19cb42..e4d514a 100644
--- a/ACedIt/submission.py
+++ b/ACedIt/submission.py
@@ -2,6 +2,9 @@
 import re
 import os
 import util
+import threading
+import progressbar
+import sys
 try:
     from bs4 import BeautifulSoup as bs
     import requests as rq
@@ -20,14 +23,73 @@ class Utilities:
 
     cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "ACedIt")
 
+    @staticmethod
+    def store_code(site, contest, contestant, problem_code, code):
+        """
+        Utility function to store code
+        """
+        directory = os.path.join(
+            os.getcwd(), site, contest + "_submissions", contestant)
+
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+
+        filename = os.path.join(
+            os.getcwd(), site, contest + "_submissions", contestant, problem_code)
+
+        with open(filename, 'w') as handler:
+            try:
+                handler.write(code.encode('utf-8'))
+            except:
+                print "Failed to get " + contestant + "'s code for " + problem_code
+
     @staticmethod
     def download_submission(args):
-        if args["site"] == "codechef":
-            platform = Codechef(args)
-
-        platform.get_submission()
+        if args["site"] == "codeforces":
+            platform = Codeforces(args)
+            platform.get_all_submissions()
+
+    @staticmethod
+    def get_html(url):
+        """
+        Utility function get the html content of an url
+        """
+        try:
+            r = rq.get(url)
+        except Exception as e:
+            sys.exit(0)
+        return r
+
+    @staticmethod
+    def empty_pool(pool):
+        """
+        Utility function to join all threads in pool
+        """
+        for thread in pool:
+            thread.join()
+        return []
 
-class Codechef:
+    @staticmethod
+    def set_found_code(found_code):
+        """
+        Utility function to create map for problem codes
+        """
+        for key, value in found_code.items():
+            found_code[key] = False
+        return found_code
+
+    @staticmethod
+    def update_status(pbar, status_flag, status_lock):
+        """
+        Utility function to update status bar while threading
+        """
+        status_lock.acquire(True)
+        pbar.update(status_flag["previous"] + status_flag["increment"])
+        status_flag["previous"] += status_flag["increment"]
+        status_lock.release()
+
+
+class Codeforces(util.Codeforces):
     """
     Class to handle downloading of test cases from Codeforces
     """
@@ -36,32 +98,182 @@ def __init__(self, args):
         self.site = args["site"]
         self.contest = args["contest"]
         self.problem = args["problem"]
-        self.nos = int(args["submission"])
-        self.status = args["status"]
-        self.status_dict = {"AC":"15", "WA":"14", "TLE":"13", "RTE":"12", "CTE":"11"}
-        if args["username"] is not None:
-            self.username = args["username"]
+        self.nos = 0
+        if args["submission"].isdigit():
+            self.nos = int(args["submission"])
         else:
-            self.username = ""
+            self.user = args["submission"]
 
-    def get_submission(self):
+    def get_rank_list(self):
         """
-        Method to get submissions
+        Method to get the rank list of the contest
         """
+        url = "http://codeforces.com/contest/" + self.contest + "/standings"
+        res = Utilities.get_html(url)
+        soup = bs(res.text, "html.parser")
+        rankList = []
+        currentCount = 1
 
-        print "Fetching submissions for " + self.contest + "-" + self.problem + " from Codechef..."
-        status = self.status_dict[self.status]
-        url = "https://www.codechef.com/"+ self.contest + "/status/" + self.problem + "?sort_by=Date%2FTime&sorting_order=asc&language=All&" "status=" + status + "&handle=" + self.username
-        res = util.Utilities.get_html(url)
-        soup = bs(res.text, 'html.parser')
-        cnt = 1
-
-        for link in soup.find_all('a'):
-            link = str(link.get('href'))
-            match = re.search(r'viewsolution',link)
-            if match:
-                print "https://codechef.com" + link
-                cnt+=1
-                if cnt > self.nos:
+        for link in soup.find_all("a"):
+            link = str(link.get("href"))
+            try:
+                match = re.search("/profile/(.*)", link).group(1)
+                rankList.append(match)
+                currentCount += 1
+                if currentCount > self.nos:
                     break
+            except Exception as e:
+                continue
+
+        return rankList
+
+    def get_code(self, submission_id):
+        """
+        Method to get the code for submission id of a contestant
+        """
+        url = "http://codeforces.com/contest/" + \
+            self.contest + "/submission/" + submission_id
+        res = Utilities.get_html(url)
+        soup = bs(res.text, 'lxml')
+        return soup.pre.string
+
+    def get_all_pages(self, url, contestant):
+        """
+        Method to get the entire history of submission pages by the contestant
+        """
+        res = Utilities.get_html(url)
+        soup = bs(res.text, 'lxml')
+        span_set = soup.findAll("span", {"class": "page-index"})
+        maxPageCount = 1
+
+        if len(span_set) is not 0:
+            span = span_set[-1]
+            maxPageCount = re.search('/page/(.*?)"', str(span)).group(1)
+            maxPageCount = int(maxPageCount)
+
+        url = "http://codeforces.com/submissions/" + contestant + "/page/"
+        maxPageCount += 1
+        pageLinks = []
+
+        for page_no in range(1, maxPageCount):
+            pageLinks.append(url + str(page_no))
+
+        responses = (grq.get(link) for link in pageLinks)
+        pages = grq.map(responses)
+
+        return pages
+
+    def get_valid_submissions(self, contestant, page, count, found_code):
+        """
+        Method to get all the submissions that match the problem and contest code
+        """
+
+        return found_code
+
+    def get_user_submission(self, contestant, found_code, pbar, status_lock, status_flag, single_user):
+        url = "http://codeforces.com/submissions/" + contestant
+        pages = self.get_all_pages(url, contestant)
+        submissions_found = False
+        count = len(found_code)
+        found_users_code = False
+
+        for page in pages:
+            prevcount = count
+
+            if count is 0 or page is None:
+                return
+
+            soup = bs(page.text, "lxml")
+            tr_set = soup.find_all("tr")
+
+            for tr in tr_set:
+                contest_details_res = re.search(
+                    'href="/problemset/problem/(.*?)"', str(tr))
+                problem_status_res = re.search(
+                    'submissionverdict="(.*?)"', str(tr))
+                submission_id_res = re.search('submissionid="(.*?)"', str(tr))
+
+                if contest_details_res and problem_status_res and submission_id_res:
+                    contest_details = contest_details_res.group(1).split('/')
+                    contest_code = contest_details[0]
+                    pcode = contest_details[1]
+                    problem_status = problem_status_res.group(1)
+                    submission_id = submission_id_res.group(1)
+                    filename = os.path.join(
+                        os.getcwd(), "codeforces", self.contest + "_submissions", contestant, pcode)
+                    if os.path.exists(filename):
+                        found_users_code = True
+                        continue
+
+                    if problem_status == "OK":
+                        problem_status = "AC"
+
+                    if contest_code == self.contest and found_code[pcode] == False and problem_status == "AC":
+                        try:
+                            AC_code = self.get_code(submission_id)
+                        except Exception as e:
+                            continue
+                        Utilities.store_code(
+                            "codeforces", self.contest, contestant, pcode, AC_code)
+                        found_code[pcode] = True
+                        count -= 1
+
+                        if single_user:
+                            Utilities.update_status(
+                                pbar, status_flag, status_lock)
+                            found_users_code = True
+
+        if not single_user:
+            Utilities.update_status(pbar, status_flag, status_lock)
+
+    def get_all_submissions(self):
+        url = "http://codeforces.com/contest/" + self.contest
+        req = Utilities.get_html(url)
+
+        try:
+            problem_links = self.get_problem_links(req)
+        except Exception as e:
+            print "Couldn't find submissions for the contest you were looking for"
+            return
+
+        found_code = {}
+
+        for link in problem_links:
+            pcode_res = re.search('problem/(.*)', link)
+            if pcode_res is not None:
+                pcode = pcode_res.group(1)
+                found_code[pcode] = False
+            else:
+                return
+
+        status_flag = {"previous": 0, "increment": 1}
+        status_lock = threading.Lock()
+
+        if self.nos is not 0:
+            print "Fetching submissions"
+
+            try:
+                rankList = self.get_rank_list()
+            except Exception as e:
+                print "Couldn't find submissions for the contest you were looking for, Try again later"
+                return
+
+            rankListCount = len(rankList)
+            pbar = progressbar.ProgressBar(max_value=rankListCount)
+            pool = []
+
+            for contestant in rankList:
+                found_code_copy = found_code.copy()
+                thread = threading.Thread(target=self.get_user_submission, args=(
+                    contestant, found_code_copy, pbar, status_lock, status_flag, False,))
+                thread.start()
+                pool.append(thread)
+
+                if len(pool) > 10:
+                    pool = Utilities.empty_pool(pool)
+        else:
+            pbar = progressbar.ProgressBar(max_value=len(found_code))
+            print "Fetching " + self.user + "'s submissions"
+            self.get_user_submission(
+                self.user, found_code, pbar, status_lock, status_flag, True)
 

From cc04a13354d1a0809d382855fbbacc52aead683a Mon Sep 17 00:00:00 2001
From: dragonzurfer
Date: Wed, 23 Aug 2017 20:29:30 +0530
Subject: [PATCH 6/6] Modify '-sub' tag and remove username and status tags

---
 ACedIt/util.py | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/ACedIt/util.py b/ACedIt/util.py
index 3d4ae6c..9fef291 100644
--- a/ACedIt/util.py
+++ b/ACedIt/util.py
@@ -47,15 +47,7 @@ def parse_flags():
 
         parser.add_argument("-sub","--submission",
                             dest="submission",
-                            help="The number of AC'd submissions")
-
-        parser.add_argument("-status",
-                            dest="status",
-                            help="The submission status, e.g. AC, WA etc")
-
-        parser.add_argument("-u","--username",
-                            dest="username",
-                            help="The username, e.g. rng_58, rajat1603 etc")
+                            help="The number of AC'd submissions or Username e.g. rng_58")
 
         parser.set_defaults(force=False)
 
@@ -83,8 +75,6 @@ def parse_flags():
         flags["force"] = args.force
         flags["site"] = flags["site"].lower()
         flags["submission"] = args.submission
-        flags["status"] = args.status
-        flags["username"] = args.username
 
         return flags
 