Skip to content

Commit 0c6bae6

Browse files
committed
Started solving chapter 11
1 parent c9cd063 commit 0c6bae6

File tree

6 files changed

+7624
-0
lines changed

6 files changed

+7624
-0
lines changed

Chapter 11/Exercise11_1.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from urllib.request import urlopen
2+
def news(url, words):
    '''prints, for each string in list words, how many times it
       occurs in the resource with URL url

       The resource is decoded with the default codec of bytes.decode()
       and the match is case-sensitive. Nothing is returned.'''
    # the with statement closes the response even if read() or decode() fails
    with urlopen(url) as response:
        html = response.read().decode()

    for word in words:
        print('{} appears {} times.'.format(word, html.count(word)))
11+

Chapter 11/Exercise11_2.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from html.parser import HTMLParser
2+
class MyHTMLParser(HTMLParser):
    '''HTML parser that prints every start and end tag it meets,
       indented by the current nesting depth; br and p tags are skipped'''

    def __init__(self):
        'sets up the underlying parser and starts with no indentation'
        super().__init__()
        self.space = 0    # number of blanks printed before the next tag

    def handle_starttag(self, tag, attrs):
        'prints the start tag, then indents what follows by 4 more blanks'
        if tag in ('br', 'p'):
            return
        # print() puts one blank between its arguments, so the tag name
        # is always preceded by one separator blank after the indentation
        print(' ' * self.space, tag, 'start')
        self.space += 4

    def handle_endtag(self, tag):
        'removes 4 blanks of indentation, then prints the end tag'
        if tag in ('br', 'p'):
            return
        self.space -= 4
        print(' ' * self.space, tag, 'end')
19+

Chapter 11/ch11.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
from urllib.request import urlopen
2+
def getSource(url):
    '''returns the content of resource specified by url as a string

       The downloaded bytes are decoded with the default codec
       of bytes.decode().'''
    # the with statement closes the response even if read() fails
    with urlopen(url) as response:
        html = response.read()
    return html.decode()
7+
8+
9+
10+
from html.parser import HTMLParser
11+
class LinkParser(HTMLParser):
    '''HTML doc parser that prints the value of every
       href attribute found in an anchor start tag'''

    def handle_starttag(self, tag, attrs):
        'prints each href value of the tag when the tag is an anchor'
        if tag != 'a':    # only anchor tags are of interest
            return

        # attrs is a list of (name, value) pairs
        for name, value in attrs:
            if name == 'href':
                print(value)
24+
25+
26+
27+
from urllib.parse import urljoin
28+
from html.parser import HTMLParser
29+
class Collector(HTMLParser):
    '''HTML parser that gathers the hyperlink URLs of a document
       into a list and its text data into a string'''

    def __init__(self, url):
        'sets up the parser, stores url, and starts with no links and no text'
        super().__init__()
        self.url = url      # base against which relative hrefs are resolved
        self.links = []     # absolute URLs collected so far
        self.text = ''      # text data collected so far (Practice Problem 11.3)

    def handle_starttag(self, tag, attrs):
        'stores the absolute form of every href found in an anchor start tag'
        if tag != 'a':
            return

        for name, value in attrs:
            if name != 'href':
                continue
            # resolve the href against the URL given to the constructor
            absolute = urljoin(self.url, value)
            if absolute[:4] == 'http':    # keep only URLs beginning with http
                self.links.append(absolute)

    # Solution to Practice Problem 11.3
    def handle_data(self, data):
        'appends data to the text collected so far'
        self.text = ''.join((self.text, data))

    def getLinks(self):
        'returns the list of collected absolute URLs'
        return self.links

    # Solution to Practice Problem 11.3
    def getData(self):
        'returns all collected text data as one string'
        return self.text
64+
65+
66+
67+
##################################
68+
# Solutions to Practice Problems #
69+
##################################
70+
71+
72+
# Practice Problem 11.1
73+
from urllib.request import urlopen
74+
def news(url, topics):
    '''counts in resource with URL url the frequency
       of each topic in list topics

       The comparison ignores case: both the downloaded content and each
       topic are lower-cased before counting. Every topic is printed as
       it was given. Nothing is returned.'''
    # download and decode resource to obtain all lowercase content;
    # the with statement closes the response even if read() fails
    with urlopen(url) as response:
        content = response.read().decode().lower()

    for topic in topics:    # find frequency of topic in content
        # content is lowercase, so the topic must be lower-cased as well
        # or a topic with an uppercase letter could never be found
        n = content.count(topic.lower())
        print('{} appears {} times.'.format(topic, n))
85+
86+
87+
# Practice Problem 11.2
88+
from html.parser import HTMLParser
89+
class MyHTMLParser(HTMLParser):
    '''HTML doc parser that prints start and end tags indented
       by their depth in the document; br and p tags are skipped'''

    def __init__(self):
        'sets up the underlying parser and the starting indentation'
        super().__init__()
        self.indent = 0    # number of blanks printed before the next tag

    def handle_starttag(self, tag, attrs):
        '''prints the start tag at the current indentation, then
           indents whatever follows by 4 more blanks'''
        if tag in ('br', 'p'):
            return
        print(self.indent * ' ' + tag, 'start')
        self.indent += 4

    def handle_endtag(self, tag):
        '''removes 4 blanks of indentation, then prints the end tag
           at the resulting indentation'''
        if tag in ('br', 'p'):
            return
        self.indent -= 4
        print(self.indent * ' ' + tag, 'end')
110+
111+
112+
# Practice Problem 11.6
113+
from re import findall
114+
def frequency(content):
    '''returns dictionary containing frequencies
       of words in string content

       A word is a maximal run of ASCII letters; matching is
       case-sensitive, so 'The' and 'the' are counted separately.
       The result is a plain dict mapping each word to its count.'''
    # imported here so the block does not depend on the file's import lines
    from collections import Counter

    pattern = '[a-zA-Z]+'
    words = findall(pattern, content)

    # Counter does the tallying; convert back so callers still get a dict
    return dict(Counter(words))
126+

0 commit comments

Comments
 (0)