# Imports
import os
import requests
import string
import pandas as pd
from bs4 import BeautifulSoup
from glob import glob
# BeautifulSoup Demo: Scraping Counter-Currents
Note: This has been edited to scrape the website in small batches. You can’t run this freely without making changes.
Some Background on this site…
I work with extremist language, whether detecting hate speech or examining language from the far right (and sometimes the far left).
I was working on a project related to Counter-Currents, a site that “promote(s) white identity politics”. So if you’re wondering why this is the example: it’s part of my work, and I would never suggest someone read this type of language without ample preparation.
Before you begin:
- I’ll likely edit this to scrape a site that is less… abrasive. But for now, this is the code I had previously used.
- PLEASE change this demo to a site that makes sense for you!
- I respect your mental health and well-being.
### Call the Main page of interest - Counter Currents
mainpageURL = 'https://counter-currents.com'
page = requests.get(mainpageURL)
## Check Page Pull Success
def PageSuccess(page):
    if page.status_code == 200:
        print('Success!')
    else:
        print("Page error occurred.")

PageSuccess(page)
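As a side note, requests can do a similar check for you: `Response.raise_for_status()` raises an exception on 4xx/5xx codes. A minimal sketch, kept separate from the PageSuccess helper above:

# Optional alternative: let requests raise on a bad status code.
try:
    page.raise_for_status()              # raises requests.HTTPError on 4xx/5xx responses
    print('Success!')
except requests.HTTPError as err:
    print('Page error occurred:', err)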
## Run Beautiful Soup on Main page
soup = BeautifulSoup(page.content, 'html.parser')
## Pulling URLs for archives section into list
archives = soup.find(id="archives-2")
#print(archives.prettify())
archive_months = archives.find_all('li')
## How many months of archives did I pull?
print("How many months worth of archives did I pull?: ", len(archive_months), '\n', "Years: ", len(archive_months)/12)
#display(archive_months)
## For each month in archive, pull link into list and provide details if wanted/needed
archive_url_list = []

for a_month in archive_months:
    link = a_month.find('a')['href']
    archive_url_list.append(link)

print("Does this count match the above month count?: ", len(archive_url_list)==len(archive_months))
## For each article in the archive month list, get url
article_url_list_ = []

for URL in archive_url_list:
    soup2 = BeautifulSoup(requests.get(URL).content, 'html.parser')
    articles = soup2.find_all('h2', class_="entry-title")
    for art in articles:
        link_a = art.find('a')['href']
        article_url_list_.append(link_a)
## How many article urls did I pull?
print("How many article urls did I pull?: ", len(article_url_list_), '\n'*2, article_url_list_[:3])
from datetime import date
today = str(date.today())
print("Today's date:", today)
# Save URL list so you don't have to repeat the process
#a_urllist = pd.DataFrame(article_url_list_)
#a_urllist.to_csv(r'urllist' + today + '.csv', index=False)
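The matching reload step might look like this (a sketch, assuming the one-column CSV written by the commented-out lines above):

#article_url_list_ = pd.read_csv(r'urllist' + today + '.csv').iloc[:, 0].tolist()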
# Holder/Dictionary
text_data = {}
# Batch Pulls
for URL in article_url_list_[1999:2025]:
    url = URL
    soup3 = BeautifulSoup(requests.get(URL).content, 'html.parser')
    title_tag = soup3.find('h1', class_="entry-title")
    if title_tag != None:
        title = title_tag.text
    else:
        title = 'NONE' + str(url)
    title2 = soup3.title.text
    text = soup3.find('div', class_="entry-content").text
    author = soup3.find('span', class_='author vcard').text
    articledeets = soup3.find('div', class_="entry-utility")
    pubdate = articledeets.find('span', class_='entry-date date updated').text
    origpubtag = articledeets.find('span', class_='cat-links').text
    taglinks = articledeets.find('span', class_='tag-links')
    tagslist = []
    if taglinks != None:
        tags = taglinks.find_all('a')
        for link in tags:
            addtag = link.get_text()
            tagslist.append(addtag)
    else:
        tagslist.append('NONE')
    text_data[title] = {}
    text_data[title]['url'] = url
    text_data[title]['author'] = author
    text_data[title]['ext_title'] = title2
    text_data[title]['content'] = text
    text_data[title]['pubdate'] = pubdate
    text_data[title]['pubbed_in'] = origpubtag
    text_data[title]['other_tags'] = tagslist
# Batch Pulls
for URL in article_url_list_[2025:2225]:
    url = URL
    soup3 = BeautifulSoup(requests.get(URL).content, 'html.parser')
    title_tag = soup3.find('h1', class_="entry-title")
    if title_tag != None:
        title = title_tag.text
    else:
        title = 'NONE' + str(url)
    title2 = soup3.title.text
    text = soup3.find('div', class_="entry-content").text
    author = soup3.find('span', class_='author vcard').text
    articledeets = soup3.find('div', class_="entry-utility")
    pubdate = articledeets.find('span', class_='entry-date date updated').text
    origpubtag = articledeets.find('span', class_='cat-links').text
    taglinks = articledeets.find('span', class_='tag-links')
    tagslist = []
    if taglinks != None:
        tags = taglinks.find_all('a')
        for link in tags:
            addtag = link.get_text()
            tagslist.append(addtag)
    else:
        tagslist.append('NONE')
    text_data[title] = {}
    text_data[title]['url'] = url
    text_data[title]['author'] = author
    text_data[title]['ext_title'] = title2
    text_data[title]['content'] = text
    text_data[title]['pubdate'] = pubdate
    text_data[title]['pubbed_in'] = origpubtag
    text_data[title]['other_tags'] = tagslist
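The two batch cells above are identical except for the slice. If you adapt this notebook, one way to avoid maintaining the parsing logic twice is to collect it into a helper; this is only a sketch that reuses the selectors above and assumes the same page structure (`parse_article` is not part of the original code).

def parse_article(soup3, url):
    """Sketch: the per-article parsing from the batch cells, gathered into one function."""
    title_tag = soup3.find('h1', class_="entry-title")
    title = title_tag.text if title_tag != None else 'NONE' + str(url)
    deets = soup3.find('div', class_="entry-utility")
    taglinks = deets.find('span', class_='tag-links')
    tagslist = [a.get_text() for a in taglinks.find_all('a')] if taglinks != None else ['NONE']
    record = {'url': url,
              'author': soup3.find('span', class_='author vcard').text,
              'ext_title': soup3.title.text,
              'content': soup3.find('div', class_="entry-content").text,
              'pubdate': deets.find('span', class_='entry-date date updated').text,
              'pubbed_in': deets.find('span', class_='cat-links').text,
              'other_tags': tagslist}
    return title, record

# Usage sketch for a batch (pick the slice you want):
#for URL in article_url_list_[start:stop]:
#    soup3 = BeautifulSoup(requests.get(URL).content, 'html.parser')
#    title, record = parse_article(soup3, URL)
#    text_data[title] = record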
a_text = pd.DataFrame(text_data).T
# pulled-at date: save to txt & csv
a_text.to_csv(r'CounterCurrentsDatapulledtt_' + today + '.txt')
a_text.to_csv(r'CounterCurrentsDatapulledtt_' + today + '.csv')
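One caveat about CSV here: the `other_tags` column holds Python lists, which come back as plain strings after a round trip through `to_csv`/`read_csv`. If that matters for your analysis, a JSON dump keeps the lists intact (a sketch; the filename just mirrors the CSV one):

#a_text.to_json(r'CounterCurrentsDatapulledtt_' + today + '.json', orient='index')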
pd.set_option('display.max_rows', None)
#val = text_data
#display(val)
display(a_text.head())