I think I have already solved this issue!
You can run the code below on Google Colaboratory. Feel free to ask questions further if needed. Cheers!
This code saves tables from multiple URLs into Excel Sheets.
#for writing to excel(xlsx) we will be needing XlsxWriter, please install it first if you don't have it!
try:
import XlsxWriter
except ModuleNotFoundError:
print("XlsxWriter is not installed!!")
get_ipython().system("pip install XlsxWriter")
#to scrape a table from a webpage
from urllib.parse import urlparse,urlsplit
import requests
import pandas as pd
import os
urls=["https://www.macrotrends.net/countries/IND/india/gdp-growth-rate",
"http://www.inwea.org/wind-energy-in-india/wind-power-potential",
"https://en.wikipedia.org/wiki/List_of_districts_in_India",
"https://en.wikipedia.org/wiki/List_of_Indian_people_by_net_worth",
"https://en.wikipedia.org/wiki/States_and_union_territories_of_India",
"https://en.wikipedia.org/wiki/List_of_governors-general_of_India",
"https://en.wikipedia.org/wiki/List_of_Indian_independence_activists",
"https://en.wikipedia.org/wiki/List_of_Indian_Grammy_Award_winners_and_nominees",
"https://en.wikipedia.org/wiki/List_of_Indian_Academy_Award_winners_and_nominees",
"https://en.wikipedia.org/wiki/List_of_highest-grossing_Indian_films"
]
print(len(urls),"Urls Found")
#convert the sheetname- remove _ and - , put title case and remove spaces
def modify_name(my_str):
replaced=my_str.replace("_", " ").replace("-", " ")
return replaced.title().replace(" ","")
#get all tables from a url
def get_dataframes(url):
html = requests.get(url).content
df_list = pd.read_html(html)
#print(len(df_list)," Dataframes Returned")
return df_list
#if df is too small then don't add it
def filter_dfs(dfs_list,min_rows=10):
new_dfs_list=[]
for each_df in dfs_list:
if(len(each_df)>min_rows):
new_dfs_list.append(each_df)
return new_dfs_list
#to avoid InvalidWorksheetName: Excel worksheet name 'StatesAndUnionTerritoriesOfIndia1' must be <= 31 chars.
def crop_name(name,thres=29):
if len(name)<thres:
return name
else:
return name[:thres]
#to get first n elements from list only
def crop_list(lst,thres=29):
if len(lst)<thres:
return lst
else:
return lst[:thres]
#converts urls to dataframes to excel sheets
#get_max= get the maximum number of tables from each url
#min_rows= the minimum number of rows in each table to save it to the excel sheet
#crop_name_thres= some excel sheets can get quite huge sheet names which blows up the code
#so crop the sheet name for the better purpose
def urls_to_excel(urls,excel_path=None,get_max=10,min_rows=0,crop_name_thres=29):
excel_path=os.path.join(os.getcwd(),"Excel_Multiple_Sheets_Output.xlsx") if excel_path==None else excel_path
writer = pd.ExcelWriter(excel_path, engine='xlsxwriter')
i=0
for url in urls:
parsed=urlsplit(url)
sheet_name=parsed.path.split('/')[-1]
mod_sheet_name=crop_name(modify_name(sheet_name),thres=crop_name_thres)
dfs_list=get_dataframes(url)
filtered_dfs_list=filter_dfs(dfs_list,min_rows=min_rows)
filtered_dfs_list=crop_list(filtered_dfs_list,thres=get_max)
for each_df in filtered_dfs_list:
print("Parsing Excel Sheet "," : ",str(i)+mod_sheet_name)
i+=1
each_df.to_excel(writer, sheet_name=str(i)+mod_sheet_name, index=True)
writer.save()
urls_to_excel(urls,get_max=1,min_rows=10)
Obtained Output after running:
XlsxWriter is not installed!!
Collecting XlsxWriter
Downloading https://files.pythonhosted.org/packages/2b/98/17875723b6814fc4d0fc03f0997ee00de2dbd78cf195e2ec3f2c9c789d40/XlsxWriter-1.3.3-py2.py3-none-any.whl (144kB)
|████████████████████████████████| 153kB 9.4MB/s
Installing collected packages: XlsxWriter
Successfully installed XlsxWriter-1.3.3
10 Urls Found
Parsing Excel Sheet : 0GdpGrowthRate
Parsing Excel Sheet : 1WindPowerPotential
Parsing Excel Sheet : 2ListOfDistrictsInIndia
Parsing Excel Sheet : 3ListOfIndianPeopleByNetWorth
Parsing Excel Sheet : 4StatesAndUnionTerritoriesOfIn
Parsing Excel Sheet : 5ListOfGovernorsGeneralOfIndia
Parsing Excel Sheet : 6ListOfIndianIndependenceActiv
Parsing Excel Sheet : 7ListOfIndianGrammyAwardWinner
Parsing Excel Sheet : 8ListOfIndianAcademyAwardWinne
Parsing Excel Sheet : 9ListOfHighestGrossingIndianFi