Breach Lab
by
# Imports were collapsed onto single lines by the export; one statement per
# line restores valid syntax. Duplicated imports are merged, none are removed.
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px  # Requires: pip install plotly
import seaborn as sns
import squarify  # pip install squarify
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud

# Load the lyrics workbook. The code below reads the columns
# 'album_name', 'track_name' and 'Lyrics' from this frame.
file_path = '/Users/meredithsmith/Desktop/TØPAnalysis/Alltøplyrics.xlsx'
df = pd.read_excel(file_path)
df.head()

# Fetch the NLTK resources needed for stopword filtering and VADER scoring.
nltk.download('stopwords')
nltk.download('vader_lexicon')

# Stored as a set so membership tests during token filtering are O(1).
stop_words = set(stopwords.words('english'))
def clean_lyrics(text, stopword_set=None):
    """Return *text* lowercased with punctuation-wrapped words kept and stopwords removed.

    Each whitespace-separated token has leading/trailing non-word characters
    stripped before filtering, so "love," is kept as "love" instead of being
    discarded. Tokens that still contain non-letters afterwards (numbers,
    contractions such as "don't") are dropped, as before.

    Args:
        text: Lyrics of one song; any value is coerced with ``str()``.
        stopword_set: Optional collection of words to drop. Defaults to the
            module-level ``stop_words``.

    Returns:
        The surviving words joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    tokens = (re.sub(r"^\W+|\W+$", "", raw) for raw in str(text).lower().split())
    kept = [tok for tok in tokens if tok.isalpha() and tok not in stopword_set]
    return " ".join(kept)
# Stopword-free version of every song, used for the word cloud below.
df['Clean_Lyrics'] = df['Lyrics'].apply(clean_lyrics)

# VADER compound score per song, computed on the raw (uncleaned) lyrics.
sia = SentimentIntensityAnalyzer()
df['Sentiment_Score'] = df['Lyrics'].apply(
    lambda lyric: sia.polarity_scores(str(lyric))['compound']
)

# Mean score per album, sorted ascending.
album_sentiment = df.groupby('album_name')['Sentiment_Score'].mean().sort_values()
print("Average Sentiment by Album:\n", album_sentiment)

# Word cloud for the album with the highest mean sentiment.
top_album = album_sentiment.idxmax()
text = " ".join(df[df['album_name'] == top_album]['Clean_Lyrics'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title(f"Most Frequent Words in {top_album}")
plt.axis('off')
plt.show()
# Hand-curated stopword vocabulary: pronouns, auxiliaries and prepositions plus
# lyric filler left over after punctuation stripping ("dont", "im", "hoohoohoo").
# This rebinding replaces the NLTK-derived set defined earlier. It is kept as a
# set so that per-token membership tests stay O(1).
stop_words = {
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "he", "him", "his", "she", "her",
    "it", "its", "they", "them", "their",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out",
    "on", "off", "over", "under", "again", "further", "then", "once",
    "not", "no", "yes", "so", "can", "just", "wanted", "where",
    "dont", "yeah", "let", "im", "ah", "want", "like", "ive", "all",
    "ooh", "ill", "even", "forforforforforfor", "every", "hoohoohoo",
    "hell", "yeahyeahyeahyeah", "daisychained", "thats", "there", "how",
    "somehow", "when", "used", "cant", "chorus", "lada",
}
def remove_stopwords(text, stopword_set=None):
    """Drop stopwords from *text*, comparing case-insensitively.

    Args:
        text: Input text; coerced with ``str()`` so non-string cells
            (e.g. missing values read from a spreadsheet) do not raise.
        stopword_set: Optional collection of lowercase words to remove.
            Defaults to the module-level ``stop_words``.

    Returns:
        The remaining words, in their original casing, joined by spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    kept = [word for word in str(text).split() if word.lower() not in stopword_set]
    return " ".join(kept)
# Structural overview. `info` must be *called*; the bare attribute only
# echoes the bound method.
df.info()
df.shape

# One frame per studio album, restricted to the columns used downstream.
album_columns = ["album_name", "track_name", "Lyrics"]
df_breach = df.loc[df["album_name"] == "Breach", album_columns]
df_clancy = df.loc[df["album_name"] == "Clancy", album_columns]
df_sai = df.loc[df["album_name"] == "Scaled And Icy", album_columns]
df_trench = df.loc[df["album_name"] == "Trench", album_columns]
df_blurryface = df.loc[df["album_name"] == "Blurryface", album_columns]
df_vessel = df.loc[df["album_name"] == "Vessel", album_columns]
df_self_titled = df.loc[df["album_name"] == "Twenty One Pilots", album_columns]

print(df_breach.head())
# Albums included in the combined analysis.
target_albums = [
    "Breach",
    "Clancy",
    "Scaled And Icy",
    "Trench",
    "Blurryface",
    "Vessel",
    "Twenty One Pilots",
]

# .copy() makes df_combined an independent frame: new columns are assigned to
# it later, which on a plain slice of df triggers SettingWithCopyWarning.
df_combined = df.loc[
    df["album_name"].isin(target_albums),
    ["album_name", "track_name", "Lyrics"],
].copy()

print(df_combined["album_name"].unique())
print(df_combined.shape)

print(df['track_name'].unique())

# Same data as a dict keyed by album name (title and lyrics only).
albums = {
    name: df.loc[df["album_name"] == name, ["track_name", "Lyrics"]]
    for name in target_albums
}

print(albums["Vessel"].head())
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Vocal filler that would otherwise dominate the counts.
stop_words.update(['yeah', 'oh', 'ooh', 'woah', 'la', 'na'])

target_albums = [
    "Breach",
    "Clancy",
    "Scaled And Icy",
    "Trench",
    "Blurryface",
    "Vessel",
    "Twenty One Pilots",
]

# album name -> list of (word, count) for its ten most frequent words
album_themes = {}

for album in target_albums:
    # Filter for the current album.
    # Note: use 'Lyrics' or 'lyrics' depending on your exact column name.
    album_df = df[df["album_name"] == album]

    # Combine all lyrics from this album into one big string.
    all_text = " ".join(album_df["Lyrics"].astype(str)).lower()

    # Strip punctuation hugging each token first, so "love," counts as "love"
    # rather than being dropped by the isalpha() filter.
    tokens = (re.sub(r"^\W+|\W+$", "", raw) for raw in all_text.split())
    words = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Count the most common words (themes) and store them.
    album_themes[album] = Counter(words).most_common(10)

for album, themes in album_themes.items():
    print(f"\n— Top Themes for {album} —")
    for word, count in themes:
        print(f"{word}: {count}")
import nltk
from nltk.corpus import stopwords
import gensim
from gensim import corpora
import re

# Per-album frames and their display names, in matching order.
album_list = [df_breach, df_clancy, df_sai, df_trench, df_blurryface, df_vessel, df_self_titled]
album_names = ["Breach", "Clancy", "Scaled And Icy", "Trench", "Blurryface", "Vessel", "Self Titled"]

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

# Extra filler on top of NLTK's list. A missing comma previously fused
# "johnny" and "yeahyeahyeahyeah" into one string, so neither was filtered.
extra_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "he", "him", "his", "she", "her",
    "it", "its", "they", "them", "their",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while", "of", "at", "by", "for", "with", "about",
    "against", "between", "into", "through", "during", "before", "after",
    "above", "below", "to", "from", "up", "down", "in", "out",
    "on", "off", "over", "under", "again", "further", "then", "once",
    "not", "no", "yes", "so", "can", "just", "wanted", "where",
    "dont", "yeah", "let", "im", "ah", "want", "like", "ive", "all",
    "ooh", "ill", "even", "forforforforforfor", "every", "hoohoohoo",
    "hell", "get", "chorus", "johnny", "yeahyeahyeahyeah", "daisychained",
    "thats", "there", "how", "somehow", "when", "used", "cant",
]
stop_words.update(extra_words)

print(f"Stopwords prepared. Total count: {len(stop_words)}")
def preprocess(text, stopword_set=None):
    """Tokenise one song for topic modelling.

    Lowercases *text*, deletes every character that is neither a word
    character nor whitespace, splits on whitespace, and keeps tokens that are
    longer than two characters and not stopwords.

    Args:
        text: Lyrics of one song; coerced with ``str()``.
        stopword_set: Optional collection of words to drop. Defaults to the
            module-level ``stop_words``.

    Returns:
        List of surviving tokens in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    normalized = re.sub(r'[^\w\s]', '', str(text).lower())
    return [
        word
        for word in normalized.split()
        if word not in stopword_set and len(word) > 2
    ]
# album display name -> formatted topic strings from that album's LDA model
all_album_topics = {}

for name, album_df in zip(album_names, album_list):
    # Process each song individually (one token list per song).
    processed_songs = [preprocess(s) for s in album_df["Lyrics"].astype(str)]

    # Create dictionary from the list of lists.
    dictionary = corpora.Dictionary(processed_songs)

    # Create corpus from the list of lists.
    corpus = [dictionary.doc2bow(song) for song in processed_songs]

    # Train LDA. random_state is fixed, as for the other models in this file,
    # so that topics are reproducible between runs.
    lda_model = gensim.models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=3,
        random_state=100,
    )

    all_album_topics[name] = lda_model.print_topics(-1)

# %%
import gensim
from gensim import corpora
# NOTE(review): `processed_songs` is whatever the preceding per-album loop left
# behind, i.e. the last entry of `album_list`. Confirm that is the album this
# model is meant to describe.
dictionary = corpora.Dictionary(processed_songs)

corpus = [dictionary.doc2bow(text) for text in processed_songs]

lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=10,  # Adjusted for your 14 songs
    passes=20,  # Increased passes to help the model find patterns in a small dataset
    alpha='auto',
)

print("— Discovered Lyrical Themes —")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}\n")
# Human-readable names for the three topics of the model trained above.
topic_labels = {
    0: "Vulnerability & Flight",
    1: "Anxiety & Internal Dialogue",
    2: "Nostalgia & Character Narrative",
}

print("— Final Lyrical Analysis —")
# formatted=False yields (topic_id, [(word, weight), ...]) directly, which
# avoids parsing the '0.025*"word" + ...' display string by hand.
for idx, word_weights in lda_model.show_topics(num_topics=-1, formatted=False):
    label = topic_labels.get(idx, "Unknown Theme")
    clean_words = ", ".join(word for word, _weight in word_weights)

    print(f"THEME: {label}")
    print(f"Keywords: {clean_words}\n")

# %% [markdown]
# ### Topic Model

# %%
import gensim
from gensim import corpora
# `all_processed_words` was used here without ever being defined.
# NOTE(review): built from the combined-album frame with `preprocess`, which
# matches how the other corpora in this file are prepared. Confirm this is the
# intended input.
all_processed_words = [preprocess(lyric) for lyric in df_combined["Lyrics"]]

dictionary = corpora.Dictionary(all_processed_words)

corpus = [dictionary.doc2bow(text) for text in all_processed_words]

lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    random_state=100,
    passes=20,  # More passes help with small datasets
    alpha='auto',  # Let the model learn the topic distribution
    per_word_topics=True,
)

print("— Discovered Lyrical Themes —")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}\n")
# Keyword lexicons (lowercase) used to count theme words per song.
faith_words = [
    'faith', 'believe', 'god', 'pray', 'lord', 'creator', 'church',
    'soul', 'spirit', 'halo', 'jesus', "lion", "holy",
]
# A missing comma previously fused "love" and 'hopeful' into the single
# entry "lovehopeful", so neither word was ever counted.
hope_words = [
    'hope', "love", 'hopeful', 'holding', 'light', 'morning', 'sun',
    'sky', 'alive', 'stay', 'future', 'hold', 'better', "friends",
]
word_words = ["flesh", "den", "mind"]
def count_keywords(text, lexicon):
    """Count how many tokens of *text* appear in *lexicon*.

    Tokens are lowercased and stripped of leading/trailing non-word
    characters, so "God," and "(god)" both count as "god". Every occurrence
    counts, not just the first.

    Args:
        text: Lyrics of one song; coerced with ``str()``.
        lexicon: Iterable of lowercase keywords.

    Returns:
        Number of matching tokens (0 for empty text).
    """
    vocabulary = set(lexicon)  # O(1) lookups instead of scanning a list per token
    tokens = (re.sub(r"^\W+|\W+$", "", raw) for raw in str(text).lower().split())
    return sum(1 for token in tokens if token in vocabulary)
# Per-song keyword counts for each lexicon.
df_combined['Faith_Count'] = df_combined['Lyrics'].apply(lambda x: count_keywords(x, faith_words))
df_combined['Hope_Count'] = df_combined['Lyrics'].apply(lambda x: count_keywords(x, hope_words))
# Previously counted hope_words again, making Word_Count a copy of Hope_Count.
df_combined['Word_Count'] = df_combined['Lyrics'].apply(lambda x: count_keywords(x, word_words))

# Five highest-scoring songs for each theme.
top_faith_songs = df_combined.nlargest(5, 'Faith_Count')[['album_name', 'track_name', 'Faith_Count']]
top_hope_songs = df_combined.nlargest(5, 'Hope_Count')[['album_name', 'track_name', 'Hope_Count']]
top_word_songs = df_combined.nlargest(5, 'Word_Count')[['album_name', 'track_name', 'Word_Count']]

print("— Top 5 Songs for Faith —")
print(top_faith_songs)
print("\n— Top 5 Songs for Hope —")
print(top_hope_songs)
print("\n— Top 5 Songs for Word —")
print(top_word_songs)

# Mean keyword count per song, by album.
album_trends = df_combined.groupby('album_name')[['Faith_Count', 'Hope_Count']].mean()

print("\n— Theme Density by Album —")
print(album_trends.sort_values(by='Faith_Count', ascending=False))
import matplotlib.pyplot as plt

# One labelled point per album: mean faith count (x) against mean hope count (y).
plt.figure(figsize=(10, 6))
for album in album_trends.index:
    faith_avg = album_trends.loc[album, 'Faith_Count']
    hope_avg = album_trends.loc[album, 'Hope_Count']
    plt.scatter(faith_avg, hope_avg, label=album, s=100)
    # Small offset keeps the label from sitting on top of its marker.
    plt.text(faith_avg + 0.05, hope_avg + 0.05, album)

plt.title("Faith vs. Hope Across Albums")
plt.xlabel("Average Faith Word Count")
plt.ylabel("Average Hope Word Count")
# The dashed style must be two ASCII hyphens; an en dash is not a valid linestyle.
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
import re
def clean_lyric(text, stopword_set=None):
    """Tokenise one song: lowercase, strip punctuation, drop short words and stopwords.

    Args:
        text: Lyrics of one song; coerced with ``str()``.
        stopword_set: Optional collection of words to drop. Defaults to the
            module-level ``stop_words``.

    Returns:
        List of tokens longer than two characters that are not stopwords.
    """
    if stopword_set is None:
        stopword_set = stop_words
    normalized = re.sub(r'[^\w\s]', '', str(text).lower())
    return [
        word
        for word in normalized.split()
        if word not in stopword_set and len(word) > 2
    ]
# One token list per song, in df_combined row order.
all_processed_songs = [clean_lyric(s) for s in df_combined["Lyrics"]]

from gensim import corpora
import gensim

dictionary = corpora.Dictionary(all_processed_songs)

corpus = [dictionary.doc2bow(text) for text in all_processed_songs]

lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,  # You can increase this since you have more data now
    random_state=100,
    passes=15,
)
def get_dominant_topic(ldamodel, corpus):
    """Return the highest-probability topic for every document in *corpus*.

    Args:
        ldamodel: Object whose ``ldamodel[corpus]`` yields, per document, a
            list of ``(topic_id, probability)`` pairs (e.g. a gensim LdaModel).
        corpus: Bag-of-words corpus to pass to the model.

    Returns:
        DataFrame with one row per document, in corpus order, and columns
        ``Dominant_Topic`` (int) and ``Topic_Perc_Contribution`` (probability
        rounded to 4 decimals).
    """
    records = []
    for doc_topics in ldamodel[corpus]:
        # A gensim model trained with per_word_topics=True yields a tuple
        # (doc_topics, word_topics, phi); only the first element is needed.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        # max() picks the first pair with the highest probability, the same
        # pair a stable descending sort would put first.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        records.append([int(topic_num), round(prop_topic, 4)])
    return pd.DataFrame(records, columns=['Dominant_Topic', 'Topic_Perc_Contribution'])
# Labels for the five topics of the combined-album model. They are defined
# before mapping so the earlier three-topic labels are not applied by mistake.
topic_labels = {
    0: "Faith & Resilience",
    1: "Emotional Exposure",
    2: "Search for Truth",
    3: "Wasted Time",
    4: "Path & Purpose",
}

df_topics = get_dominant_topic(lda_model, corpus)

# `corpus` was built from df_combined row by row, so titles and albums must be
# taken from df_combined positionally. all_albums_df (below) is concatenated in
# a different album order and would attach topics to the wrong songs.
df_topics['Song_Name'] = df_combined['track_name'].values
df_topics['Album'] = df_combined['album_name'].values
df_topics['Topic_Label'] = df_topics['Dominant_Topic'].map(topic_labels)

# Master frame of the per-album slices; used only for the sanity check below.
all_albums_df = pd.concat([
    df_self_titled,
    df_vessel,
    df_blurryface,
    df_trench,
    df_sai,
    df_clancy,
    df_breach,
]).reset_index(drop=True)

print(df_topics[['Album', 'Song_Name', 'Topic_Label']].head(20))

# The two counts should agree if every song went through the model.
print(f"Songs in Master DF: {len(all_albums_df)}")
print(f"Songs processed by LDA: {len(df_topics)}")
# Songs whose token list is empty after cleaning contribute nothing to the
# model. `all_processed_songs` is the list the corpus was built from; the
# previously referenced `all_processed_words` is never defined.
empty_songs = [i for i, tokens in enumerate(all_processed_songs) if not tokens]
if empty_songs:
    print(f"Warning: Songs at indices {empty_songs} are empty after preprocessing.")

df_topics['Topic_Label'] = df_topics['Dominant_Topic'].map(topic_labels)

print(df_topics[['Song_Name', 'Topic_Label', 'Topic_Perc_Contribution']])
import matplotlib.pyplot as plt

df_topics['Topic_Label'] = df_topics['Dominant_Topic'].map(topic_labels)

# Order bars by topic id. Derived from topic_labels itself rather than a
# hard-coded range(5), so it stays valid if the number of topics changes.
ordered_labels = [topic_labels[topic_id] for topic_id in sorted(topic_labels)]
topic_counts = df_topics['Topic_Label'].value_counts().reindex(ordered_labels).fillna(0)

plt.figure(figsize=(12, 6))
topic_counts.plot(kind='bar', color='maroon', edgecolor='black')

# NOTE(review): the title names "Breach", but df_topics is built from the
# combined-album frame. Confirm which scope is intended.
plt.title('Frequency of Lyrical Themes in "Breach"', fontsize=14)
plt.xlabel('Lyrical Theme', fontsize=12)
plt.ylabel('Number of Songs', fontsize=12)

plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt

# Same distribution, keyed by numeric topic id instead of label.
topic_counts = df_topics['Dominant_Topic'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
topic_counts.plot(kind='bar', color='maroon')
plt.title('Number of Songs per Lyrical Theme')
plt.xlabel('Topic ID')
plt.ylabel('Number of Songs')
plt.xticks(rotation=0)
plt.show()
# Songs per (album, theme) and the same table as row percentages.
album_summary = df_topics.groupby(['Album', 'Topic_Label']).size().unstack(fill_value=0)

album_pct = album_summary.div(album_summary.sum(axis=1), axis=0) * 100
print(album_pct)

# Release year per album; the dict's insertion order doubles as the
# chronological album order used by later plots.
release_dates = {
    "Twenty One Pilots": 2009,
    "Regional at Best": 2011,
    "Vessel": 2013,
    "Blurryface": 2015,
    "Trench": 2018,
    "Scaled And Icy": 2021,
    "Clancy": 2024,
    "Breach": 2025,  # Adjust based on your specific dataset context
}

df_combined['Release_Year'] = df_combined['album_name'].map(release_dates)

# NOTE(review): this reorders df_combined. df_topics and the LDA corpus were
# aligned to the previous row order, so do not re-align them positionally
# against df_combined after this point.
df_combined = df_combined.sort_values(by=['Release_Year', 'album_name']).reset_index(drop=True)
import matplotlib.pyplot as plt
import seaborn as sns

# Albums that actually occur in df_topics, in release order.
albums_present = set(df_topics['Album'])
ordered_albums = [a for a in release_dates if a in albums_present]

theme_dist = df_topics.groupby(['Album', 'Topic_Label']).size().unstack(fill_value=0)
theme_dist = theme_dist.reindex(ordered_albums)  # This forces the chronological order

theme_dist.plot(kind='bar', stacked=True, figsize=(12, 7), colormap='YlOrRd')

# Imports for the lexical-diversity section that follows.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
def calculate_diversity(text):
    """Return the type-token ratio (unique words / total words) of *text*.

    Tokens are lowercased and stripped of leading/trailing non-word
    characters before counting, so "Love," and "love" are the same word.
    Without this, punctuation inflates the number of "unique" words.

    Args:
        text: Lyrics of one song; coerced with ``str()``.

    Returns:
        A ratio in (0, 1], or 0 when the text contains no words.
    """
    stripped = (re.sub(r"^\W+|\W+$", "", raw) for raw in str(text).lower().split())
    tokens = [tok for tok in stripped if tok]  # drop tokens that were pure punctuation
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)
df_combined['Lexical_Diversity'] = df_combined['Lyrics'].apply(calculate_diversity)

# Mean diversity per album, ordered by release year.
diversity_scores = df_combined.groupby('album_name')['Lexical_Diversity'].mean().reset_index()
diversity_scores['Release_Year'] = diversity_scores['album_name'].map(release_dates)

diversity_scores = diversity_scores.sort_values('Release_Year')

warm_colors = ['#4A0E0E', '#7B0828', '#9B1B30', '#B22222', '#C0392B', '#D72638', '#E74C3C']

plt.figure(figsize=(10, 6))
sns.barplot(
    data=diversity_scores,
    x='Lexical_Diversity',
    y='album_name',
    hue='album_name',
    # Pass only as many colours as there are albums to avoid a
    # palette-length mismatch.
    palette=warm_colors[:len(diversity_scores)],
    legend=False,  # Removes the redundant legend
)

plt.title('Vocabulary Richness Over Time (Chronological)', fontsize=16)
plt.xlabel('Unique Words / Total Words Ratio', fontsize=12)
plt.ylabel('Album (Oldest to Newest)', fontsize=12)
# Dashed style is two ASCII hyphens; an en dash is not a valid linestyle.
plt.grid(axis='x', linestyle='--', alpha=0.7)

plt.show()
from textblob import TextBlob

# Analyse each song once and read both scores from the same result, instead
# of building a TextBlob twice per song.
song_sentiments = [TextBlob(str(lyric)).sentiment for lyric in df_combined['Lyrics']]
df_combined['Polarity'] = [s.polarity for s in song_sentiments]
df_combined['Subjectivity'] = [s.subjectivity for s in song_sentiments]

plt.figure(figsize=(12, 8))
sns.scatterplot(data=df_combined, x='Polarity', y='Subjectivity', hue='album_name', s=100, alpha=0.7)

# Vertical guide at polarity 0. The dashed style must be two ASCII hyphens.
plt.axvline(0, color='red', linestyle='--', alpha=0.5)
plt.title('Song Emotional Map: Polarity vs. Subjectivity', fontsize=16)
plt.xlabel('Emotional Tone (Negative <—> Positive)', fontsize=12)
plt.ylabel('Personal/Subjective Intensity', fontsize=12)
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# VADER compound score per song, then the mean per album.
df_combined['Sentiment'] = df_combined['Lyrics'].apply(lambda x: sia.polarity_scores(str(x))['compound'])
album_sentiments = df_combined.groupby('album_name')['Sentiment'].mean()

# Albums with a sentiment score, in release order.
ordered_albums = [a for a in release_dates if a in album_sentiments.index]

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(20, 10))
axes = axes.flatten()

for ax, album in zip(axes, ordered_albums):
    # All lyrics of this album as one string. astype(str) guards against
    # non-string cells, which would make str.join raise.
    album_lyrics = df_combined.loc[df_combined['album_name'] == album, 'Lyrics']
    text = " ".join(album_lyrics.astype(str))
    score = album_sentiments[album]

    # Colour scheme by mean sentiment:
    # positive (> 0.05) = 'Reds', negative (< -0.05) = 'Blues', otherwise 'coolwarm'.
    if score > 0.05:
        color_map = 'Reds'
        mood = "Positive"
    elif score < -0.05:
        color_map = 'Blues'
        mood = "Negative/Intense"
    else:
        color_map = 'coolwarm'
        mood = "Neutral"

    # Generate the word cloud.
    wc = WordCloud(
        colormap=color_map,
        max_words=50,
        stopwords=stop_words,
    ).generate(text)

    # Plot it in this album's panel.
    ax.imshow(wc, interpolation='bilinear')
    ax.set_title(f"{album}\nScore: {score:.2f} ({mood})", fontsize=14)
    ax.axis('off')

# Hide unused panels. Slicing by count also works when there are no albums,
# where the old loop index would have been undefined.
for ax in axes[len(ordered_albums):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

import networkx as nx
import matplotlib.pyplot as plt

lore_keywords = {'dema', 'bishop', 'clancy', 'torch', 'neon', 'vulture', 'rebel', 'bandito', 'yellow'}

G = nx.Graph()

# Tokenise every song once and keep only its lore keywords. Tokens are
# stripped of surrounding punctuation so "dema," still matches "dema".
song_lore = []
for _, song in df_combined.iterrows():
    # Add the song as a node.
    G.add_node(song['track_name'], album=song['album_name'])
    tokens = {
        re.sub(r"^\W+|\W+$", "", raw)
        for raw in str(song['Lyrics']).lower().split()
    }
    song_lore.append((song['track_name'], tokens & lore_keywords))

# Connect each pair of songs that shares at least one lore keyword. Pairs are
# formed by list position, so this does not depend on df_combined's index labels.
for idx_a, (title_a, lore_a) in enumerate(song_lore):
    if not lore_a:
        continue
    for title_b, lore_b in song_lore[idx_a + 1:]:
        shared_words = lore_a & lore_b
        if shared_words:
            # Edge weight = number of lore keywords the two songs share.
            G.add_edge(title_a, title_b, weight=len(shared_words))

plt.figure(figsize=(15, 12))
pos = nx.spring_layout(G, k=0.15)  # k controls the distance between nodes

nx.draw(G, pos, with_labels=True, node_size=50, font_size=8, alpha=0.7, edge_color='gray')
plt.title("The Twenty One Pilots 'Lore' Network")
plt.show()
from pyvis.network import Network

# Interactive HTML version of the lore graph.
net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white", notebook=True)

net.from_nx(G)
net.show("lore_network.html")

# Imports for the rhyme-density section that follows.
import pandas as pd
import pronouncing
import seaborn as sns
import matplotlib.pyplot as plt
def calculate_rhyme_score(text, window=10, rhyme_lookup=None):
    """Return the share of words that rhyme with a word shortly after them.

    A word counts once if any of the next *window* words is among its rhymes.

    Args:
        text: Lyrics of one song; coerced with ``str()``.
        window: How many following words to inspect for a rhyme.
        rhyme_lookup: Callable mapping a word to an iterable of rhyming
            words. Defaults to ``pronouncing.rhymes``.

    Returns:
        ``rhyming_words / total_words``, or 0 when no alphabetic words remain.
    """
    if rhyme_lookup is None:
        rhyme_lookup = pronouncing.rhymes

    # Strip punctuation BEFORE the isalpha() filter. The original filtered
    # first, which discarded every word followed by a comma or full stop.
    stripped = (raw.lower().strip('.,!?"') for raw in str(text).split())
    words = [word for word in stripped if word.isalpha()]
    if not words:
        return 0

    # Look each distinct word up once; lyrics repeat heavily. A set gives
    # O(1) membership for the look-ahead.
    rhyme_sets = {}
    rhyme_count = 0

    for i, current_word in enumerate(words):
        if current_word not in rhyme_sets:
            rhyme_sets[current_word] = set(rhyme_lookup(current_word))
        candidates = rhyme_sets[current_word]

        # Look ahead at the next few words for a rhyme match; count the word
        # once if it rhymes with something nearby.
        look_ahead = words[i + 1:i + 1 + window]
        if any(future_word in candidates for future_word in look_ahead):
            rhyme_count += 1

    # Score = rhyming words / total words
    return rhyme_count / len(words)
def syllable_count(word):
    """Return the syllable count of *word* from its first listed pronunciation.

    Args:
        word: The word to look up with ``pronouncing.phones_for_word``.

    Returns:
        Syllable count of the first pronunciation, or 0 when the lookup
        returns no pronunciations.
    """
    phones = pronouncing.phones_for_word(word)
    if not phones:
        return 0
    return pronouncing.syllable_count(phones[0])
df_combined['Rhyme_Density'] = df_combined['Lyrics'].apply(calculate_rhyme_score)

# Mean rhyme density per album, in release order.
album_rhyme_stats = df_combined.groupby('album_name')['Rhyme_Density'].mean().reset_index()
album_rhyme_stats['Year'] = album_rhyme_stats['album_name'].map(release_dates)
album_rhyme_stats = album_rhyme_stats.sort_values('Year')

plt.figure(figsize=(12, 6))
sns.lineplot(data=album_rhyme_stats, x='album_name', y='Rhyme_Density', marker='o', color='#E21F26', linewidth=2.5)

plt.title('Technical Complexity: Rhyme Density Over Time', fontsize=16)
plt.ylabel('Rhyme Density Score (Rhymes/Total Words)')
plt.xlabel('Album Chronology')
plt.xticks(rotation=45)
# Dashed style is two ASCII hyphens; an en dash is not a valid linestyle.
plt.grid(axis='y', linestyle='--', alpha=0.4)
plt.show()

# Imports for the repetition-score section that follows.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import zlib
def calculate_repetition_score(text):
    """Estimate how repetitive *text* is from its zlib compression ratio.

    Args:
        text: Lyrics of one song; coerced with ``str()``.

    Returns:
        ``1 - compressed_size / original_size``, floored at 0. Falsy input
        and texts shorter than 10 characters score 0.
    """
    if not text or len(str(text)) < 10:
        return 0

    # Encode the lowercased text to bytes.
    encoded_text = str(text).lower().encode('utf-8')

    # Compare compressed size against original size.
    original_size = len(encoded_text)
    compressed_size = len(zlib.compress(encoded_text))

    # A higher score means more of the song was "redundant" (repeated).
    # For short, non-repetitive text the compressed form is larger than the
    # input, which would give a meaningless negative score, so floor at 0.
    repetition_score = 1 - (compressed_size / original_size)
    return max(0.0, repetition_score)
df_combined['Repetition_Score'] = df_combined['Lyrics'].apply(calculate_repetition_score)

# Mean repetition score per album, in release order.
album_structure = df_combined.groupby('album_name')['Repetition_Score'].mean().reset_index()
album_structure['Year'] = album_structure['album_name'].map(release_dates)
album_structure = album_structure.sort_values('Year')

plt.figure(figsize=(12, 6))

sns.barplot(data=album_structure, x='Repetition_Score', y='album_name', palette='magma', hue='album_name', legend=False)

plt.title('Song Structure: Experimentalism vs. Formulaic Patterns', fontsize=16)
plt.xlabel('Repetition Score (Higher = More Repeated Choruses/Hooks)', fontsize=12)
plt.ylabel('Album', fontsize=12)
# Dashed style is two ASCII hyphens; an en dash is not a valid linestyle.
plt.grid(axis='x', linestyle='--', alpha=0.5)
plt.show()

# Imports for the structural-fingerprint section that follows.
import numpy as np
import matplotlib.pyplot as plt
song_title = "Stressed Out"  # Replace with any track_name present in df_combined

# Fail with a clear message when the title is absent, instead of an opaque
# IndexError from `.values[0]`.
matching_lyrics = df_combined.loc[df_combined['track_name'] == song_title, 'Lyrics']
if matching_lyrics.empty:
    raise ValueError(f"No track named {song_title!r} found in df_combined")
lyrics = matching_lyrics.values[0]

# Non-empty lines, normalised for comparison.
lines = [line.strip().lower() for line in lyrics.split('\n') if line.strip()]
n = len(lines)

# Self-similarity matrix: entry (i, j) is 1 when line i equals line j.
# Broadcasting a column against a row replaces the nested Python loops.
line_array = np.array(lines, dtype=object)
matrix = (line_array[:, None] == line_array[None, :]).astype(float)

plt.figure(figsize=(8, 8))
plt.imshow(matrix, cmap='binary', interpolation='none')

plt.title(f'Structural Fingerprint: {song_title}', fontsize=15)
plt.xlabel('Line Number')
plt.ylabel('Line Number')

plt.grid(False)
plt.show()
# One row per album with the mean of each per-song metric computed above.
master_report = df_combined.groupby('album_name').agg({
    'Sentiment': 'mean',
    'Lexical_Diversity': 'mean',
    'Rhyme_Density': 'mean',
    'Repetition_Score': 'mean',
}).reset_index()

# Order the report chronologically.
master_report['Year'] = master_report['album_name'].map(release_dates)
master_report = master_report.sort_values('Year')

print("— FINAL DISCOGRAPHY ANALYSIS —")
print(master_report)
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.pyplot as plt
import seaborn as sns

# Share of each theme within an album, in percent. This name was used on the
# thematic page below without being defined; it is derived here from the
# chronological count table `theme_dist`.
theme_dist_pct = theme_dist.div(theme_dist.sum(axis=1), axis=0) * 100

# All pages go into ONE PdfPages context. Opening the same file a second time
# (as before) overwrote the finished report with just the summary page.
with PdfPages('T0P_Data_Analysis_Report.pdf') as pdf:

    # --- PAGE 0: The Executive Summary ---
    plt.figure(figsize=(11, 8.5))  # Standard Letter size
    plt.axis('off')

    # NOTE(review): the figures in this text (92 songs, X%, 40%) are
    # hard-coded prose, not computed from the data. "X%" is still a
    # placeholder to fill in.
    summary_text = (
        "SUMMARY OF FINDINGS\n\n"
        "This analysis of 92 songs reveals a band in a state of constant technical transition.\n"
        "Early eras focus on Lexical Diversity and Faith themes, while later eras like 'Trench'\n"
        "show a spike in Rhyme Density and Lore-building.\n\n"
        "Key Takeaways:\n"
        "1. Technical Growth: Rhyme complexity has increased by X% since 2009.\n"
        "2. Structural Shift: 'Scaled And Icy' represents the most formulaic peak.\n"
        "3. Thematic Legacy: 'Lore' keywords now connect over 40% of the discography."
    )

    plt.text(0.1, 0.9, summary_text, fontsize=12, verticalalignment='top', family='serif')
    plt.title("Twenty One Pilots: A Data-Driven Discography Review", fontsize=16, fontweight='bold')
    pdf.savefig()
    plt.close()

    # --- PAGE 1: The Master Metrics Table ---
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.axis('tight')
    ax.axis('off')
    # Render the master_report dataframe as a table.
    table = ax.table(
        cellText=master_report.values,
        colLabels=master_report.columns,
        loc='center',
        cellLoc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    plt.title("Discography Master Metrics Overview", fontsize=16, pad=20)
    pdf.savefig()  # Saves the current figure into the PDF
    plt.close()

    # --- PAGE 2: Sentiment vs Lexical Diversity ---
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df_combined, x='Sentiment', y='Lexical_Diversity', hue='album_name', s=100)
    plt.title("Emotional Tone vs. Vocabulary Richness")
    pdf.savefig()
    plt.close()

    # --- PAGE 3: Thematic Evolution (Stacked Bar) ---
    theme_dist_pct.plot(kind='bar', stacked=True, figsize=(12, 7))
    plt.title("Thematic Evolution Over Time")
    plt.legend(bbox_to_anchor=(1.05, 1))
    pdf.savefig(bbox_inches='tight')
    plt.close()

    # --- PAGE 4: Rhyme Density Trend ---
    plt.figure(figsize=(10, 6))
    sns.lineplot(data=album_rhyme_stats, x='album_name', y='Rhyme_Density', marker='o')
    plt.title("Technical Rap Complexity Over Time")
    plt.xticks(rotation=45)
    pdf.savefig(bbox_inches='tight')
    plt.close()

print("Success! Your full report has been saved as 'T0P_Data_Analysis_Report.pdf'")

# %%
import seaborn as sns
import matplotlib.pyplot as plt
# Songs per (album, theme); albums are put in release order for the y-axis.
heatmap_data = df_topics.groupby(['Album', 'Topic_Label']).size().unstack(fill_value=0)

ordered_albums = [a for a in release_dates if a in heatmap_data.index]
heatmap_data = heatmap_data.reindex(ordered_albums)

plt.figure(figsize=(12, 8))
sns.heatmap(
    heatmap_data,
    annot=True,  # Shows the actual song counts in the boxes
    fmt="d",  # Formats counts as integers
    cmap="YlOrRd",  # Yellow-Orange-Red sequential gradient
    cbar_kws={'label': 'Number of Songs'},
)

plt.title("Thematic Fingerprint of Twenty One Pilots", fontsize=18, pad=20)
plt.xlabel("Lyrical Themes (LDA Identified)", fontsize=12)
plt.ylabel("Album (Chronological Order)", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()

# Save before show(); the figure may be cleared once it has been shown.
plt.savefig("Report_Cover_Heatmap.png", dpi=300)
plt.show()
tags: