web scraper python

Solutions on MaxInterview for web scraper python by the best coders in the world

showing results for - "web scraper python"

27 Feb 2016

1from requests import get
2from requests.exceptions import RequestException
3from contextlib import closing
4from bs4 import BeautifulSoup
5

source

Ibtissem

11 Mar 2019

>>> from bs4 import BeautifulSoup
>>> raw_html = open('contrived.html').read()
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for p in html.select('p'):
...     if p['id'] == 'walrus':
...         print(p.text)

'I am the walrus'


source

Ricardo

03 Mar 2016

>>> raw_html = simple_get('http://www.fabpedigree.com/james/mathmen.htm')
>>> html = BeautifulSoup(raw_html, 'html.parser')
>>> for i, li in enumerate(html.select('li')):
...     print(i, li.text)

0  Isaac Newton
 Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

1  Archimedes
 Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

2  Carl F. Gauss
 Leonhard Euler
 Bernhard Riemann

3  Leonhard Euler
 Bernhard Riemann

4  Bernhard Riemann

# 5 ... and many more...


source

Khalil

30 Sep 2016

def get_names():
    """
    Download the page listing the mathematicians and return their names.

    The page is fetched with `simple_get` and parsed with BeautifulSoup;
    every non-blank line of text found inside an `<li>` element is treated
    as one name.

    Returns:
        A list of unique name strings. The names are de-duplicated through
        a set, so the order of the list is not meaningful.

    Raises:
        Exception: if `simple_get` returned None for the URL.
    """
    url = 'http://www.fabpedigree.com/james/mathmen.htm'
    response = simple_get(url)

    if response is None:
        # Raise an exception if we failed to get any data from the url
        raise Exception('Error retrieving contents at {}'.format(url))

    html = BeautifulSoup(response, 'html.parser')
    names = set()
    for li in html.select('li'):
        for line in li.text.split('\n'):
            # Strip *before* testing for emptiness: a whitespace-only line
            # must be skipped, not stored as an empty-string "name".
            name = line.strip()
            if name:
                names.add(name)
    return list(names)

source

Delores

04 May 2017

def get_hits_on_name(name):
    """
    Return the page-view count reported for a mathematician.

    Accepts a `name` of a mathematician, requests the URL built from
    `url_root`, and looks for the first link whose href contains
    'latest-60' (the 60-day page-view link). The link text is parsed as an
    integer after removing thousands separators.

    Args:
        name: the mathematician's name, substituted into `url_root`.

    Returns:
        The hit count as an `int`, or None when the page could not be
        fetched, no matching link exists, or the link text is not a number.
        Every failure is reported through `log_error`.
    """
    # url_root is a template string that is used to build a URL.
    url_root = 'URL_REMOVED_SEE_NOTICE_AT_START_OF_ARTICLE'
    response = simple_get(url_root.format(name))

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')

        # Use .get() so anchors without an href attribute are skipped
        # instead of raising KeyError.
        hit_link = [a for a in html.select('a')
                    if 'latest-60' in a.get('href', '')]

        if hit_link:
            # Strip commas
            link_text = hit_link[0].text.replace(',', '')
            try:
                # Convert to integer
                return int(link_text)
            except ValueError:
                # Only a malformed number is expected here; anything else
                # should propagate rather than be silently swallowed.
                log_error("couldn't parse {} as an `int`".format(link_text))

    log_error('No pageviews found for {}'.format(name))
    return None

source

Tom

12 Feb 2016

def simple_get(url, timeout=None):
    """
    Attempt to get the content at `url` by making an HTTP GET request.

    Args:
        url: the address to fetch.
        timeout: optional number of seconds (or a (connect, read) tuple)
            passed to `requests.get`. The default of None keeps the
            previous behaviour of waiting indefinitely.

    Returns:
        The raw response body as bytes (`resp.content`) when
        `is_good_response` accepts the response; otherwise None. None is
        also returned, after the error is logged through `log_error`, when
        the request raises a `RequestException`.
    """
    try:
        # closing() guarantees the streamed connection is released even
        # when the response is rejected.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Return True if the response seems to be HTML, False otherwise.

    Args:
        resp: a response object exposing `status_code` and a mapping-like
            `headers` attribute.

    Returns:
        True only when the status code is 200 and the Content-Type header
        is present and contains 'html' (case-insensitive).
    """
    # A response may legitimately omit Content-Type; fall back to an empty
    # string so that case yields False instead of raising KeyError.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    It is always a good idea to log errors. This function just prints
    them to standard output, but you can make it do anything.

    Args:
        e: the error message (or any printable object) to report.
    """
    print(e)

source

similar questions

scrape phone numbers from website python scrape all the p tags in a python use beautifulsoup or scrapy to scrape a book store best scraping package in python python web crawler python web scraping python download from mediafire with scraping how to scrape data from a html page saved locally python scrape data from aspx page web scraping using python code web scraping with python beautifulsoup scraping list from html python selenium facebook scraper python proxy scraper python web scraping project python selenium web scraping example web scraping python python scrape filedropper

queries leading to this page