import requests
from bs4 import BeautifulSoup

# Fetch the Monster job search results page
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
page = requests.get(URL)

# Parse the raw HTML into a BeautifulSoup object
soup = BeautifulSoup(page.content, 'html.parser')
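
# What usually follows: pulling individual results out of the parsed page.
# A sketch only -- the 'ResultsContainer' id and 'card-content' class are
# assumptions about Monster's markup and may not match the live site.
results = soup.find(id='ResultsContainer')
if results is not None:
    for card in results.find_all('section', class_='card-content'):
        title = card.find('h2', class_='title')
        company = card.find('div', class_='company')
        if title and company:
            print(title.get_text(strip=True), '-', company.get_text(strip=True))
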
# pip install beautifulsoup4 lxml

import os
import requests
from bs4 import BeautifulSoup

url = "https://www.google.com/"
response = requests.get(url)

if response.ok:
    soup = BeautifulSoup(response.text, "lxml")
    title = str(soup.find("title"))

    # Strip the surrounding <title> tags, keeping only the text
    title = title.replace("<title>", "")
    title = title.replace("</title>", "")
    print("The title is : " + title)

os.system("pause")  # Windows-only: keeps the console window open

# python (code name).py
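
# Simpler alternative for the same lookup: BeautifulSoup can return the tag
# text directly, so the string replacements above are not needed.
# (Sketch; reuses the response fetched above.)
if response.ok:
    soup = BeautifulSoup(response.text, "lxml")
    if soup.title is not None:
        print("The title is : " + soup.title.get_text())
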
# basic web scraping with python
# Import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

# Set the URL you want to scrape
url = 'http://web.mta.info/developers/turnstile.html'

# Connect to the URL
response = requests.get(url)

# Parse HTML and save to BeautifulSoup object
soup = BeautifulSoup(response.text, "html.parser")

# To download the whole data set, loop through all the <a> tags
line_count = 1  # variable to track what line you are on
for one_a_tag in soup.find_all('a'):  # 'a' tags are for links
    if line_count >= 36:  # links to the text files start at line 36
        link = one_a_tag['href']
        download_url = 'http://web.mta.info/developers/' + link
        urllib.request.urlretrieve(download_url, './' + link[link.find('/turnstile_') + 1:])
        time.sleep(1)  # pause the code for a second
    # add 1 for the next line
    line_count += 1
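
# The line_count >= 36 check assumes the page layout never changes. A more
# robust variant (a sketch -- it assumes the data files keep the
# 'turnstile_*.txt' naming) filters on the href itself instead:
for a_tag in soup.find_all('a', href=True):
    href = a_tag['href']
    if 'turnstile_' in href and href.endswith('.txt'):
        download_url = 'http://web.mta.info/developers/' + href
        filename = href.rsplit('/', 1)[-1]  # e.g. turnstile_180922.txt
        urllib.request.urlretrieve(download_url, './' + filename)
        time.sleep(1)  # be polite to the server
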
import scrapy
from ..items import SampletestItem  # items class

class QuoteTestSpider(scrapy.Spider):
    name = 'quote_test'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css("div.quote")
        for quote in quotes:
            items = SampletestItem()  # create a fresh item for every quote
            items['title'] = quote.css("span.text::text").get()
            items['author'] = quote.css(".author::text").get()
            items['tags'] = quote.css(".tags .tag::text").getall()
            yield items

        # Follow the pagination link and parse the next page the same way
        next_page = response.css(".next a::attr(href)").get()
        if next_page is not None:
            next_url = response.urljoin(next_page)
            yield scrapy.Request(next_url, callback=self.parse)
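
# items.py (sketch): the spider above imports SampletestItem from the
# project's items.py; declare one scrapy.Field per key the spider assigns.
# Field names come from the spider; the rest is standard Scrapy boilerplate.
import scrapy

class SampletestItem(scrapy.Item):
    title = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
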
# example of web scraping links with asyncio, pushing the blocking
# requests.get calls onto a thread pool (threads, not separate cores)
import asyncio
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup as BS

executor = ThreadPoolExecutor(max_workers=8)

async def make_requests():
    # lister.php ids run through 0-9 and then a-z
    ids = '0123456789abcdefghijklmnopqrstuvwxyz'
    urls = ['http://www.filedropper.com/lister.php?id=' + i for i in ids]

    loop = asyncio.get_running_loop()
    futures = [loop.run_in_executor(executor, requests.get, url) for url in urls]
    await asyncio.wait(futures)

    for future in futures:
        soup = BS(future.result().content, 'html.parser')
        for all_links in soup.find_all('a', href=True):
            print("URL:", all_links['href'])
            with open('filedropper_com.txt', 'a') as f:
                f.write(all_links['href'] + '\n')

asyncio.run(make_requests())
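
# Alternative sketch: do the fetching natively on the event loop with aiohttp
# (pip install aiohttp), so no thread pool is needed. Same output, assuming
# the same urls list as above.
import aiohttp
from bs4 import BeautifulSoup

async def fetch(session, url):
    async with session.get(url) as resp:
        return await resp.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, u) for u in urls))
    with open('filedropper_com.txt', 'a') as f:
        for page in pages:
            soup = BeautifulSoup(page, 'html.parser')
            for a in soup.find_all('a', href=True):
                print("URL:", a['href'])
                f.write(a['href'] + '\n')

# asyncio.run(main(urls))
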
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
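
# The snippet above stops at the imports. A sketch of the kind of guarded GET
# helper these imports typically set up -- the function name, checks, and
# messages here are assumptions, not part of the original.
def simple_get(url):
    """Return the content at url if the response looks like HTML, else None."""
    try:
        with closing(get(url, stream=True)) as resp:
            content_type = resp.headers.get('Content-Type', '').lower()
            if resp.status_code == 200 and 'html' in content_type:
                return resp.content
            return None
    except RequestException as e:
        print('Request to {} failed: {}'.format(url, e))
        return None

# Typical use:
# html = simple_get('https://example.com')
# if html is not None:
#     soup = BeautifulSoup(html, 'html.parser')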