1from requests import get
2from requests.exceptions import RequestException
3from contextlib import closing
4from bs4 import BeautifulSoup
5
>>> from bs4 import BeautifulSoup
>>> raw_html = open('contrived.html').read()
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for p in html.select('p'):
...     if p['id'] == 'walrus':
...         print(p.text)

'I am the walrus'
9
>>> raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for i, li in enumerate(html.select('li')):
...     print(i, li.text)

0 Isaac Newton
 Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

1 Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

2 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

 3 Leonhard Euler
 Bernhard Riemann

4 Bernhard Riemann

# 5 ... and many more...
27
def get_names():
    """
    Downloads the page where the list of mathematicians is found
    and returns a list of strings, one per mathematician.

    The text of every <li> element is split on newlines and each
    non-blank line is treated as one name. Names are collected in a
    set, so duplicates are collapsed and the order of the returned
    list is not defined.

    Raises:
        Exception: if `simple_get` returned None for the page.
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    # Raise an exception if we failed to get any data from the url
    if response is None:
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    names = set()
    for li in html.select('li'):
        for line in li.text.split('\n'):
            # Strip *before* testing for emptiness; otherwise a line made
            # only of whitespace would be added to the result as ''.
            name = line.strip()
            if name:
                names.add(name)
    return list(names)
20
def get_hits_on_name(name):
    """
    Accepts a `name` of a mathematician and returns the number
    of hits that mathematician's Wikipedia page received in the
    last 60 days, as an `int`.

    The count is read from the text of the first <a> element whose
    href contains 'latest-60', with thousands separators removed.

    Returns None when the page could not be fetched, no such link
    exists, or the link text is not an integer; every one of those
    paths ends by reporting 'No pageviews found' through `log_error`.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        # Use .get() so anchors without an href attribute are skipped
        # instead of raising KeyError.
        hit_link = [a for a in html.select('a')
                    if 'latest-60' in a.get('href', '')]

        if hit_link:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # Only a failed int() conversion is expected here; a bare
                # except would also swallow KeyboardInterrupt/SystemExit.
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None
28
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    raw response body (`resp.content`), otherwise return None.

    Args:
        url: address to fetch.
        timeout: seconds to wait on the server before giving up; passed
            straight through to `requests.get`. Without a timeout a
            stalled server would block the caller indefinitely.

    Returns:
        The response body, or None when `is_good_response` rejects the
        response or the request raised a `RequestException` (which is
        reported through `log_error`).
    """
    try:
        # closing() guarantees the streamed connection is released even
        # when the response is rejected and its body is never read.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
17
18
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains 'html' (compared case-insensitively).
    A response without a Content-Type header is reported as not good
    rather than raising KeyError.
    """
    # Look the header up with .get() and test for None *before* calling
    # .lower(); indexing and lowering first made the None check unreachable.
    content_type = resp.headers.get('Content-Type')
    if content_type is None:
        return False
    return resp.status_code == 200 and 'html' in content_type.lower()
27
28
def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can
    make it do anything.

    Args:
        e: the error (or message) to report; it is handed to `print`
            unchanged, so any printable object is accepted.
    """
    print(e)
36