by Grayson Adkins, updated August 15, 2024
In this notebook, I analyze former President Donald Trump's campaign rally speeches using the natural language processing (NLP) libraries NLTK and spaCy, and perform sentiment analysis with a BERT model.
!pip install -q bs4
I'm using speech transcripts available at rev.com, but they could just as easily be transcribed from audio files (see my notebook Call Summarization Pipeline). BeautifulSoup is used for scraping the transcripts.
import requests
from bs4 import BeautifulSoup
def scrape_website_text(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        return text
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
# Example usage
url = "https://www.rev.com/blog/transcripts/trump-and-vance-speak-at-atlanta-rally"
website_text = scrape_website_text(url)
print(website_text)
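This notebook analyzes a single rally, but the same helper could loop over several transcripts. Here's a minimal sketch, assuming a list of rev.com transcript URLs (the second entry is a hypothetical placeholder):
# List of rev.com transcript URLs to scrape (second entry is a placeholder)
rally_urls = [
    "https://www.rev.com/blog/transcripts/trump-and-vance-speak-at-atlanta-rally",
    # "https://www.rev.com/blog/transcripts/...",  # add more rally transcripts here
]

rally_texts = []
for rally_url in rally_urls:
    text = scrape_website_text(rally_url)
    if text:  # Keep only successful scrapes
        rally_texts.append(text)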
import re
from bs4 import BeautifulSoup
def extract_speaker_speech(html_content, speaker_name):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Extract the text from the transcript
    transcript_text = soup.get_text(separator=' ')
    # Refine the regex to precisely match "Donald Trump" while excluding "Donald Trump, Jr."
    speaker_pattern = fr'\b{re.escape(speaker_name)}\b\s*(?!, Jr)\s*\(\s*\d{{2}}:\d{{2}}:\d{{2}}\s*\):\s*(.*?)(?=\s*\b[A-Za-z]+\s*[A-Za-z]*\s*\(\s*\d{{2}}:\d{{2}}:\d{{2}}\s*\)|$)'
    speaker_lines = re.findall(speaker_pattern, transcript_text, re.DOTALL)
    # Join the extracted lines into a single string
    speaker_speech = ' '.join([line.strip() for line in speaker_lines])
    # Clean up any leftover timestamps or speaker annotations
    cleaned_speech = re.sub(r'\(\s*\d{2}:\d{2}:\d{2}\s*\)', '', speaker_speech)
    return cleaned_speech.strip()

# Specify the speaker's name you want to extract
speaker_name = "Donald Trump"
speaker_speech = extract_speaker_speech(website_text, speaker_name)
print(speaker_speech)

with open("speaker_text.txt", "w") as file:
    file.write(speaker_speech)
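To sanity-check the extraction regex, here's a made-up transcript snippet (not from the real page) showing that the "Donald Trump, Jr." line is skipped while the timestamped "Donald Trump" line is captured:
# Made-up snippet for testing the extraction regex
sample_html = """
<p>Donald Trump, Jr. (00:01:10): Please welcome my father.</p>
<p>Donald Trump (00:02:15): We're going to win like never before.</p>
<p>JD Vance (00:03:45): It's great to be here in Atlanta.</p>
"""

# Should print only Trump's line: "We're going to win like never before."
print(extract_speaker_speech(sample_html, "Donald Trump"))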
First, let's clean up the speech by applying some filters. Below, we count how frequently each word appears in the speech and filter out stop words (i.e., low-value words such as 'get', 'again', 'tell', 'even', 'could', 'would', 'like', etc.). We could also apply lemmatization (left commented out below) to group all variations of a given root word.
!pip install -q nltk
import nltk
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Download the required NLTK data
nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()
# Manually extend the stop words list with additional irrelevant words
additional_stop_words = {'get', 'again', 'tell', 'even', 'could', 'would', 'like', 'much', 'many', 'us', 'see', 'make', 'just', 'say', 'says', 'said', 'go', 'also', 'still', 'take', 'use', 'thank', 'their', 'it\'s', 'right', 'that', 'really', 'don\'t', 'He\'s', 'said', 'didn\'t', 'didn\' t', 'come', 'going', 'know', 'you', 'said,', 'they\' re', 'they \' re', 'want', 'don\' t', 'don \' t', 'he\' s', 'she\' s', 'that\' s', 'it\' s', 'it \' s'}
stop_words.update(additional_stop_words)
# Tokenize the text, remove stop words, and filter by word length (e.g., 4 characters or more)
filtered_words = [word for word in speaker_speech.split() if word.lower() not in stop_words and len(word) >= 4]
# # Tokenize, remove stop words, filter by word length, and apply lemmatization
# filtered_words = [
# lemmatizer.lemmatize(word.lower())
# for word in speaker_speech.split()
# if word.lower() not in stop_words and len(word) >= 4
# ]
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
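The lemmatization step is left commented out above; for reference, here's a minimal sketch of what WordNetLemmatizer does, mapping inflected forms back to their root:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data if not already present
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
print([lemmatizer.lemmatize(word) for word in ["countries", "taxes", "policies"]])
# ['country', 'tax', 'policy']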
# Count the filtered words
filtered_word_counts = Counter(filtered_words)
# Generate a word cloud
wordcloud = WordCloud(width=800, height=500, background_color='white').generate_from_frequencies(filtered_word_counts)
# Save the word cloud as a PNG file
wordcloud.to_file("word_cloud.png")
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# from collections import Counter
# import re
# def count_words(text):
# # Convert text to lowercase to make the count case-insensitive
# text = text.lower()
# # Use regex to find all words (alphanumeric sequences)
# words = re.findall(r'\b\w+\b', text)
# # Use Counter to count occurrences of each word
# word_count = Counter(words)
# return dict(word_count)
# # Count words
# word_counts = count_words(speaker_speech)
# print(word_counts)
import matplotlib.pyplot as plt
from collections import Counter
# Count the words using Counter
word_counts = Counter(speaker_speech.split())
def plot_word_frequencies(word_counts, top_n=20):
    # Get the most common words and their counts
    most_common_words = word_counts.most_common(top_n)
    # Separate the words and their counts for plotting
    words, counts = zip(*most_common_words)
    # Plot the histogram
    plt.figure(figsize=(10, 6))
    plt.barh(words, counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.gca().invert_yaxis()  # Invert y-axis to have the most frequent word at the top
    plt.show()
# Plot the word frequencies
plot_word_frequencies(word_counts)
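Note that word_counts above was built from the raw speech, so stop words dominate the chart. The same function can be pointed at the filtered counts computed earlier:
# Plot the stop-word-filtered frequencies instead of the raw counts
plot_word_frequencies(filtered_word_counts)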
import pandas as pd
from collections import Counter
def save_word_counts_to_csv(word_counts, filename="word_counts.csv"):
    # Convert the Counter object to a DataFrame
    word_counts_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])
    # Sort the DataFrame by count in descending order
    word_counts_df = word_counts_df.sort_values(by='Count', ascending=False)
    # Save the DataFrame to a CSV file
    word_counts_df.to_csv(filename, index=False)
    print(f"Word counts have been saved to {filename}")
# Save word count to file
save_word_counts_to_csv(word_counts)
Word counts have been saved to word_counts.csv
!pip install wordcloud
from wordcloud import WordCloud
# Count the words using Counter
word_counts = Counter(speaker_speech.split())
# Create a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)
# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off') # Remove axes
plt.show()
!pip install -q transformers torch
import csv
from transformers import pipeline
def analyze_sentence_sentiment(text, output_filename="sentence_sentiment_analysis_results.csv"):
    # Load the sentiment analysis pipeline with a BERT-based model
    sentiment_analyzer = pipeline("sentiment-analysis")
    # Split the text into sentences
    sentences = text.split('.')
    # Open a CSV file to write the results
    with open(output_filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Sentence", "Sentiment", "Score"])
        # Analyze sentiment for each sentence and write the results to the CSV
        for sentence in sentences:
            if sentence.strip():  # Check if the sentence is not empty
                result = sentiment_analyzer(sentence.strip())
                sentiment = result[0]['label']
                score = result[0]['score']
                # Write the sentence, sentiment, and score to the CSV file
                writer.writerow([sentence.strip(), sentiment, score])
    print(f"Sentiment analysis results have been saved to '{output_filename}'")
# Call the function
analyze_sentence_sentiment(speaker_speech)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english). Using a pipeline without specifying a model name and revision in production is not recommended.
Sentiment analysis results have been saved to 'sentence_sentiment_analysis_results.csv'
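With the per-sentence results on disk, it's easy to summarize the overall tone of the speech. Here's a minimal sketch, assuming pandas and the CSV written above:
import pandas as pd

# Load the per-sentence sentiment results written above
sentence_df = pd.read_csv("sentence_sentiment_analysis_results.csv")

# Share of positive vs. negative sentences, and the average confidence per label
print(sentence_df["Sentiment"].value_counts(normalize=True))
print(sentence_df.groupby("Sentiment")["Score"].mean())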
# # Analyze sentiment for each sentence using the BERT model
# for sentence in sentences:
# if sentence.strip(): # Check if the sentence is not empty
# result = sentiment_analyzer(sentence.strip())
# sentiment = result[0]['label']
# score = result[0]['score']
# print(f"Sentence: {sentence.strip()}\nSentiment: {sentiment}, Score: {score:.4f}\n")
!pip install -q spacy
!python -m spacy download en_core_web_sm
import spacy
import csv
from transformers import pipeline
def extract_entities_with_sentiment(text, entity_sentiment_file="entity_sentiments.csv"):
    # Load the spaCy model for English
    nlp = spacy.load("en_core_web_sm")
    # Load the sentiment analysis pipeline with a BERT-based model
    sentiment_analyzer = pipeline("sentiment-analysis")
    # Process the text with spaCy
    doc = nlp(text)
    # Entity-Level Sentiment Analysis
    with open(entity_sentiment_file, mode="w", newline="") as entity_file:
        entity_writer = csv.writer(entity_file)
        # Write the header without the "Sentence" column
        entity_writer.writerow(["Entity", "Label", "Sentiment", "Score"])
        for ent in doc.ents:
            # Extract the sentence containing the entity
            sentence = ent.sent.text
            # Perform sentiment analysis on the sentence
            sentiment_result = sentiment_analyzer(sentence)
            sentiment = sentiment_result[0]['label']
            score = sentiment_result[0]['score']
            # Write the entity, its label, and the sentiment to the CSV file (without the sentence)
            entity_writer.writerow([ent.text, ent.label_, sentiment, score])
    print(f"Entity-level sentiments have been saved to '{entity_sentiment_file}'")
# Call the function
extract_entities_with_sentiment(speaker_speech)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english). Using a pipeline without specifying a model name and revision in production is not recommended.
Entity-level sentiments have been saved to 'entity_sentiments.csv'
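Finally, the entity-level CSV can be aggregated to see which people, places, and organizations come up most often and in what tone. A minimal sketch, again assuming pandas:
import pandas as pd

# Load the entity-level sentiment results written above
entity_df = pd.read_csv("entity_sentiments.csv")

# Most frequently mentioned entities
print(entity_df["Entity"].value_counts().head(10))

# Average signed sentiment per entity (negative sentences count against the entity)
entity_df["SignedScore"] = entity_df.apply(
    lambda row: row["Score"] if row["Sentiment"] == "POSITIVE" else -row["Score"], axis=1
)
print(entity_df.groupby("Entity")["SignedScore"].mean().sort_values().head(10))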