by Grayson Adkins, updated August 15, 2024
In this notebook, I analyze former President Donald Trump's campaign rally speeches using the natural language processing (NLP) libraries NLTK and spaCy, and perform sentiment analysis with a BERT model.
!pip install -q bs4
I'm using speech transcripts available at rev.com, but they could just as easily be transcribed from audio files (see my notebook Call Summarization Pipeline). BeautifulSoup is used for scraping the transcripts.
import requests
from bs4 import BeautifulSoup
def scrape_website_text(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        return text
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return None
# Example usage
url = "https://www.rev.com/blog/transcripts/trump-and-vance-speak-at-atlanta-rally"
website_text = scrape_website_text(url)
print(website_text)
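This notebook analyzes a single rally, but the same helper could loop over several transcripts. Here's a minimal sketch, assuming a list of rev.com transcript URLs (the second entry is a hypothetical placeholder):
# List of rev.com transcript URLs to scrape (second entry is a placeholder)
rally_urls = [
    "https://www.rev.com/blog/transcripts/trump-and-vance-speak-at-atlanta-rally",
    # "https://www.rev.com/blog/transcripts/...",  # add more rally transcripts here
]

rally_texts = []
for rally_url in rally_urls:
    text = scrape_website_text(rally_url)
    if text:  # Keep only successful scrapes
        rally_texts.append(text)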
import re
from bs4 import BeautifulSoup
def extract_speaker_speech(html_content, speaker_name):
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    # Extract the text from the transcript
    transcript_text = soup.get_text(separator=' ')
    # Refine the regex to precisely match "Donald Trump" while excluding "Donald Trump, Jr."
    speaker_pattern = fr'\b{re.escape(speaker_name)}\b\s*(?!, Jr)\s*\(\s*\d{{2}}:\d{{2}}:\d{{2}}\s*\):\s*(.*?)(?=\s*\b[A-Za-z]+\s*[A-Za-z]*\s*\(\s*\d{{2}}:\d{{2}}:\d{{2}}\s*\)|$)'
    speaker_lines = re.findall(speaker_pattern, transcript_text, re.DOTALL)
    # Join the extracted lines into a single string
    speaker_speech = ' '.join([line.strip() for line in speaker_lines])
    # Clean up any leftover timestamps or speaker annotations
    cleaned_speech = re.sub(r'\(\s*\d{2}:\d{2}:\d{2}\s*\)', '', speaker_speech)
    return cleaned_speech.strip()

# Specify the speaker's name you want to extract
speaker_name = "Donald Trump"
speaker_speech = extract_speaker_speech(website_text, speaker_name)
print(speaker_speech)

with open("speaker_text.txt", "w") as file:
    file.write(speaker_speech)
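To sanity-check the extraction regex, here's a made-up transcript snippet (not from the real page) showing that the "Donald Trump, Jr." line is skipped while the timestamped "Donald Trump" line is captured:
# Made-up snippet for testing the extraction regex
sample_html = """
<p>Donald Trump, Jr. (00:01:10): Please welcome my father.</p>
<p>Donald Trump (00:02:15): We're going to win like never before.</p>
<p>JD Vance (00:03:45): It's great to be here in Atlanta.</p>
"""

# Should print only Trump's line: "We're going to win like never before."
print(extract_speaker_speech(sample_html, "Donald Trump"))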
First, let's clean up the speech by applying some filters. Below, we count how frequently each word appears in the speech and filter out stop words (i.e., low-value words such as 'get', 'again', 'tell', 'even', 'could', 'would', 'like', etc.). We could also apply lemmatization (left commented out below) to group all variations of a given root word.
!pip install -q nltk
import nltk
from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Download the required NLTK data
nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()
# Manually extend the stop words list with additional irrelevant words
additional_stop_words = {'get', 'again', 'tell', 'even', 'could', 'would', 'like', 'much', 'many', 'us', 'see', 'make', 'just', 'say', 'says', 'said', 'go', 'also', 'still', 'take', 'use', 'thank', 'their', 'it\'s', 'right', 'that', 'really', 'don\'t', 'He\'s', 'said', 'didn\'t', 'didn\' t', 'come', 'going', 'know', 'you', 'said,', 'they\' re', 'they \' re', 'want', 'don\' t', 'don \' t', 'he\' s', 'she\' s', 'that\' s', 'it\' s', 'it \' s'}
stop_words.update(additional_stop_words)
# Tokenize the text, remove stop words, and filter by word length (e.g., 4 characters or more)
filtered_words = [word for word in speaker_speech.split() if word.lower() not in stop_words and len(word) >= 4]
# # Tokenize, remove stop words, filter by word length, and apply lemmatization
# filtered_words = [
# lemmatizer.lemmatize(word.lower())
# for word in speaker_speech.split()
# if word.lower() not in stop_words and len(word) >= 4
# ]
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
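The lemmatization step is left commented out above; for reference, here's a minimal sketch of what WordNetLemmatizer does, mapping inflected forms back to their root:
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data if not already present
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
print([lemmatizer.lemmatize(word) for word in ["countries", "taxes", "policies"]])
# ['country', 'tax', 'policy']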
# Count the filtered words
filtered_word_counts = Counter(filtered_words)
# Generate a word cloud
wordcloud = WordCloud(width=800, height=500, background_color='white').generate_from_frequencies(filtered_word_counts)
# Save the word cloud as a PNG file
wordcloud.to_file("word_cloud.png")
# Display the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
# from collections import Counter
# import re
# def count_words(text):
# # Convert text to lowercase to make the count case-insensitive
# text = text.lower()
# # Use regex to find all words (alphanumeric sequences)
# words = re.findall(r'\b\w+\b', text)
# # Use Counter to count occurrences of each word
# word_count = Counter(words)
# return dict(word_count)
# # Count words
# word_counts = count_words(speaker_speech)
# print(word_counts)
import matplotlib.pyplot as plt
from collections import Counter
# Count the words using Counter
word_counts = Counter(speaker_speech.split())
def plot_word_frequencies(word_counts, top_n=20):
    # Get the most common words and their counts
    most_common_words = word_counts.most_common(top_n)
    # Separate the words and their counts for plotting
    words, counts = zip(*most_common_words)
    # Plot the histogram
    plt.figure(figsize=(10, 6))
    plt.barh(words, counts, color='skyblue')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.gca().invert_yaxis()  # Invert y-axis to have the most frequent word at the top
    plt.show()
# Plot the word frequencies
plot_word_frequencies(word_counts)
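Note that word_counts above was built from the raw speech, so stop words dominate the chart. The same function can be pointed at the filtered counts computed earlier:
# Plot the stop-word-filtered frequencies instead of the raw counts
plot_word_frequencies(filtered_word_counts)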
import pandas as pd
from collections import Counter
def save_word_counts_to_csv(word_counts, filename="word_counts.csv"):
    # Convert the Counter object to a DataFrame
    word_counts_df = pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])
    # Sort the DataFrame by count in descending order
    word_counts_df = word_counts_df.sort_values(by='Count', ascending=False)
    # Save the DataFrame to a CSV file
    word_counts_df.to_csv(filename, index=False)
    print(f"Word counts have been saved to {filename}")
# Save word count to file
save_word_counts_to_csv(word_counts)
Word counts have been saved to word_counts.csv
!pip install wordcloud
from wordcloud import WordCloud
# Count the words using Counter
word_counts = Counter(speaker_speech.split())
# Create a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)
# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off') # Remove axes
plt.show()
!pip install -q transformers torch
import csv
from transformers import pipeline
def analyze_sentence_sentiment(text, output_filename="sentence_sentiment_analysis_results.csv"):
    # Load the sentiment analysis pipeline with a BERT-based model
    sentiment_analyzer = pipeline("sentiment-analysis")
    # Split the text into sentences
    sentences = text.split('.')
    # Open a CSV file to write the results
    with open(output_filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        # Write the header
        writer.writerow(["Sentence", "Sentiment", "Score"])
        # Analyze sentiment for each sentence and write the results to the CSV
        for sentence in sentences:
            if sentence.strip():  # Check if the sentence is not empty
                result = sentiment_analyzer(sentence.strip())
                sentiment = result[0]['label']
                score = result[0]['score']
                # Write the sentence, sentiment, and score to the CSV file
                writer.writerow([sentence.strip(), sentiment, score])
    print(f"Sentiment analysis results have been saved to '{output_filename}'")
# Call the function
analyze_sentence_sentiment(speaker_speech)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english). Using a pipeline without specifying a model name and revision in production is not recommended.
Sentiment analysis results have been saved to 'sentence_sentiment_analysis_results.csv'
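With the per-sentence results on disk, it's easy to summarize the overall tone of the speech. Here's a minimal sketch, assuming pandas and the CSV written above:
import pandas as pd

# Load the per-sentence sentiment results written above
sentence_df = pd.read_csv("sentence_sentiment_analysis_results.csv")

# Share of positive vs. negative sentences, and the average confidence per label
print(sentence_df["Sentiment"].value_counts(normalize=True))
print(sentence_df.groupby("Sentiment")["Score"].mean())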
# # Analyze sentiment for each sentence using the BERT model
# for sentence in sentences:
# if sentence.strip(): # Check if the sentence is not empty
# result = sentiment_analyzer(sentence.strip())
# sentiment = result[0]['label']
# score = result[0]['score']
# print(f"Sentence: {sentence.strip()}\nSentiment: {sentiment}, Score: {score:.4f}\n")
!pip install -q spacy
!python -m spacy download en_core_web_sm
import spacy
import csv
from transformers import pipeline
def extract_entities_with_sentiment(text, entity_sentiment_file="entity_sentiments.csv"):
    # Load the spaCy model for English
    nlp = spacy.load("en_core_web_sm")
    # Load the sentiment analysis pipeline with a BERT-based model
    sentiment_analyzer = pipeline("sentiment-analysis")
    # Process the text with spaCy
    doc = nlp(text)
    # Entity-Level Sentiment Analysis
    with open(entity_sentiment_file, mode="w", newline="") as entity_file:
        entity_writer = csv.writer(entity_file)
        # Write the header without the "Sentence" column
        entity_writer.writerow(["Entity", "Label", "Sentiment", "Score"])
        for ent in doc.ents:
            # Extract the sentence containing the entity
            sentence = ent.sent.text
            # Perform sentiment analysis on the sentence
            sentiment_result = sentiment_analyzer(sentence)
            sentiment = sentiment_result[0]['label']
            score = sentiment_result[0]['score']
            # Write the entity, its label, and the sentiment to the CSV file (without the sentence)
            entity_writer.writerow([ent.text, ent.label_, sentiment, score])
    print(f"Entity-level sentiments have been saved to '{entity_sentiment_file}'")
# Call the function
extract_entities_with_sentiment(speaker_speech)
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english). Using a pipeline without specifying a model name and revision in production is not recommended.
Entity-level sentiments have been saved to 'entity_sentiments.csv'
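Finally, the entity-level CSV can be aggregated to see which people, places, and organizations come up most often and in what tone. A minimal sketch, again assuming pandas:
import pandas as pd

# Load the entity-level sentiment results written above
entity_df = pd.read_csv("entity_sentiments.csv")

# Most frequently mentioned entities
print(entity_df["Entity"].value_counts().head(10))

# Average signed sentiment per entity (negative sentences count against the entity)
entity_df["SignedScore"] = entity_df.apply(
    lambda row: row["Score"] if row["Sentiment"] == "POSITIVE" else -row["Score"], axis=1
)
print(entity_df.groupby("Entity")["SignedScore"].mean().sort_values().head(10))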