Slicing the movie streaming data into genres to visualise their rating in each country
Data: https://uranaisearchastrology.wordpress.com/wp-content/uploads/2025/08/streaming_movies.xlsx * The download is an .xlsx file; please convert it to .CSV format (streaming_movies.csv) before running the code below! *
Please click "Read more »" below:
# Importing necessary libraries
import pycountry
import math
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns
import pandas as pd
import plotly.express as px
print('Original Source: https://www.kaggle.com/datasets/ruchi798/movies-on-netflix-prime-video-hulu-and-disney')

# --- Load the dataset into `movie` -----------------------------------------
# Option A (local Jupyter / Anaconda): read the CSV from the working
# directory instead of uploading it:
#     movie = pd.read_csv('streaming_movies.csv')
#
# Option B (Google Colab): upload the CSV through the browser.
# `io` is imported here because BytesIO is needed below and no other active
# line of this script imports it.
import io

# Prompt for an upload only when this session has no (non-empty) `uploaded`
# mapping yet, so re-running the cell reuses the earlier upload instead of
# asking for the file again.
if not globals().get('uploaded'):
    from google.colab import files
    uploaded = files.upload()  # this will prompt you to select a file

filename = list(uploaded.keys())[0]  # picks the first uploaded file
movie = pd.read_csv(io.BytesIO(uploaded[filename]))
display(movie.tail())  # Display the last 5 rows of data
# Rescale both scores before combining them: IMDb values are divided by 10,
# and Rotten Tomatoes strings lose their trailing '%' and are divided by 100.
imdb_scaled = movie['IMDb'] / 10
tomatoes_scaled = movie[r'Rotten Tomatoes'].str.rstrip('%').astype(float) / 100

# Row-wise mean of the two rescaled scores, kept as a Series named 'Rating'.
Rating = pd.concat(
    {'IMDb': imdb_scaled, 'Rotten Tomatoes': tomatoes_scaled},
    axis=1,
).mean(axis=1).rename('Rating')

# Swap the two source columns for the combined score at column position 5.
movie.drop(columns=['IMDb', 'Rotten Tomatoes'], inplace=True)
movie.insert(5, 'Rating', Rating)

# Show the last 5 rows to check the result
display(movie.tail())
# Split the comma-separated 'Genres' text into one column per genre slot and
# attach the slots to `movie` as G1, G2, ...; G_list keeps those column names.
Genres = movie['Genres'].str.split(",", expand=True)
G_list = [f"G{slot}" for slot in range(1, len(Genres.columns) + 1)]
for position, column_name in enumerate(G_list):
    movie[column_name] = Genres.iloc[:, position]
# Separating multiple countries in different columns
Countries = movie['Country'].str.split(",", expand=True)
NoC = len(Countries.columns)
C_list = [f"C{slot}" for slot in range(1, NoC + 1)]

# Build a lookup from country name to alpha-3 code.
# First pass: the primary pycountry name of every country.
countries = {}
for country in pycountry.countries:
    countries[country.name] = country.alpha_3
# Second pass: alternative names, where pycountry provides them, so that an
# everyday spelling can still be resolved when the primary name is a formal
# one. setdefault() guarantees an alias never overrides a primary name.
for country in pycountry.countries:
    for alias_field in ('common_name', 'official_name'):
        alias = getattr(country, alias_field, None)
        if alias:
            countries.setdefault(alias, country.alpha_3)

# Get countries alpha3: one column C1, C2, ... per country slot.
# Names are stripped of surrounding whitespace before the lookup; missing
# slots and unrecognised names become '' (the marker used for "no country").
for position, column_name in enumerate(C_list):
    movie[column_name] = [
        countries.get(name.strip(), '') if isinstance(name, str) else ''
        for name in Countries.iloc[:, position]
    ]
# Creating a function for the output map
def MapOutPut(Data, GenreSelected, C_list, YearStart, YearEnd, Average=False):
    """Show a choropleth map of movie ratings per country for a year range.

    Args:
        Data: DataFrame holding 'Year' and 'Rating' columns plus every
            column named in ``C_list``.
        GenreSelected: Genre label; used only in the figure title.
        C_list: Names of the location columns to combine. Their values are
            passed to plotly as ``locations`` (the caller is expected to
            supply country codes plotly can resolve).
        YearStart: First year to include.
        YearEnd: Last year to include (``Series.between`` keeps both ends).
        Average: When True, plot the mean 'Rating' per location instead of
            one row per (movie, location) pair. Defaults to False, which
            keeps the un-aggregated rows.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Reshape location columns into one
    Data_locs = Data.melt(
        id_vars=['Year', 'Rating'],
        value_vars=C_list,
        var_name='LocType',
        value_name='Location',
    )

    # Drop rows without a usable location. Besides NaN this also removes
    # empty strings, which are used as the "no country" marker and would
    # otherwise survive a plain dropna().
    has_location = Data_locs['Location'].notna() & Data_locs['Location'].ne('')
    Data_locs = Data_locs[has_location]

    Data_locs_Years = Data_locs[Data_locs['Year'].between(YearStart, YearEnd)]

    if Average:
        # Collapse to a single mean rating per location.
        Data_locs_Years = Data_locs_Years.groupby(
            'Location', as_index=False
        )['Rating'].mean()

    fig = px.choropleth(
        Data_locs_Years,
        locations='Location',
        color='Rating',
        # Use the argument rather than a module-level variable so the title
        # always matches the data that was passed in.
        title=(f"{GenreSelected} Movie Rating by Country ({YearStart}–{YearEnd})"),
    )
    fig.show()
GS = 'Animation'  # GenreSelection
# Keep the movies that list GS in any of their genre columns.
movie_GenreSelected = movie[movie[G_list].eq(GS).any(axis=1)]

# Iteration to compare graphs in different year windows.
# Each window ends Step years before the previous one and reaches back up to
# Interval years, so consecutive windows overlap when Interval > Step.
Step = 5
Interval = 10

if movie_GenreSelected.empty:
    # min()/max() of an empty selection would raise; report and skip instead.
    print(f"No movies found for genre '{GS}'; nothing to plot.")
else:
    MinYear = movie_GenreSelected['Year'].min()
    MaxYear = movie_GenreSelected['Year'].max()
    # At least one window, so a genre whose movies all share a single year
    # (MaxYear == MinYear) still produces a map.
    window_count = max(1, math.ceil((MaxYear - MinYear) / Step))
    for i in range(window_count):
        YearEd = MaxYear - Step * i
        YearSt = max(MinYear, YearEd - Interval)  # never before the first year
        MapOutPut(movie_GenreSelected, GS, C_list, YearSt, YearEd)
No comments:
Post a Comment