Tuesday, August 19, 2025

Slicing Movie-Streaming Big Data with Plotly Express (px.choropleth) in Python

 



Slicing the movie-streaming data by genre to visualise each genre's ratings by country

Data: https://uranaisearchastrology.wordpress.com/wp-content/uploads/2025/08/streaming_movies.xlsx (Please convert the file to CSV format after downloading it.)

Please click "Read more »" below: 

 

# Importing necessary libraries
import pycountry
import math
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

import seaborn as sns
import pandas as pd
import plotly.express as px

print('Original Source: https://www.kaggle.com/datasets/ruchi798/movies-on-netflix-prime-video-hulu-and-disney')

# Loading the CSV data into the `movie` DataFrame.
#
# In Google Colab, upload the file once per session by running this first:
#     from google.colab import files
#     uploaded = files.upload()  # this will prompt you to select a file
# After that, `uploaded` stays in the session and is reused below.
#
# Outside Colab (e.g. Anaconda), or after a runtime restart, `uploaded` does
# not exist; in that case 'streaming_movies.csv' is read from the working
# directory instead of failing with a NameError.
import io  # needed for io.BytesIO; previously only imported in commented-out code

_uploaded = globals().get('uploaded')
if _uploaded:
    # Importing csv data from the upload already made in this session
    filename = next(iter(_uploaded))  # picks the first uploaded file
    movie = pd.read_csv(io.BytesIO(_uploaded[filename]))
else:
    # No (or empty) upload in this session: importing csv data from disk
    movie = pd.read_csv('streaming_movies.csv')

# Preview the raw data (last 5 rows)
display(movie.tail())

# Rescale both scores: IMDb is divided by 10; Rotten Tomatoes has its
# trailing '%' stripped, is converted to float and divided by 100
movie['IMDb'] = movie['IMDb'].div(10)
movie['Rotten Tomatoes'] = (
    movie['Rotten Tomatoes'].str.rstrip('%').astype(float).div(100)
)

# Row-wise mean of the two rescaled scores
Rating = movie[['IMDb', 'Rotten Tomatoes']].mean(axis=1).rename('Rating')

# Replace the two source columns by the single combined column at position 5
movie.drop(columns=['IMDb', 'Rotten Tomatoes'], inplace=True)
movie.insert(5, 'Rating', Rating)

# Preview the transformed data (last 5 rows)
display(movie.tail())

# Split the comma-separated 'Genres' string into one column per genre
Genres = movie['Genres'].str.split(",", expand=True)

# Copy every split column into `movie` as G1, G2, ... and keep the labels
G_list = []
for position, split_col in enumerate(Genres.columns, start=1):
    label = f"G{position}"
    G_list.append(label)
    movie[label] = Genres[split_col]
   
# Split the comma-separated 'Country' string into one column per country
Countries = movie['Country'].str.split(",", expand=True)
NoC = Countries.shape[1]

# Lookup table built from pycountry: country name -> alpha-3 code
countries = {entry.name: entry.alpha_3 for entry in pycountry.countries}

# Store the alpha-3 code of every split column in `movie` as C1, C2, ...;
# values with no entry in the lookup table are stored as ''
C_list = [f"C{position}" for position in range(1, NoC + 1)]
for label, split_col in zip(C_list, Countries.columns):
    movie[label] = [countries.get(name, '') for name in Countries[split_col]]
   
# Creating a function for the output map
def MapOutPut(Data, GenreSelected, C_list, YearStart, YearEnd):
    """Show a choropleth map of movie ratings by country for a year range.

    Parameters
    ----------
    Data : pandas.DataFrame
        Must contain the columns 'Year' and 'Rating' plus every column
        named in ``C_list``.
    GenreSelected : str
        Genre name; used only in the figure title.
    C_list : list of str
        Names of the columns of ``Data`` that hold the location codes
        passed to ``px.choropleth``.
    YearStart, YearEnd :
        Bounds of the year range; rows are kept when 'Year' lies between
        them (``Series.between``).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Reshape location columns into one
    Data_locs = Data.melt(
        id_vars=['Year', 'Rating'],
        value_vars=C_list,
        var_name='LocType',
        value_name='Location'
    )

    # Drop missing locations. A location may be stored as an empty string
    # rather than NaN, which dropna() alone would keep, so filter both.
    Data_locs = Data_locs.dropna(subset=['Location'])
    Data_locs = Data_locs[Data_locs['Location'] != '']
    Data_locs_Years = Data_locs[Data_locs['Year'].between(YearStart, YearEnd)]

    # NOTE(review): rows are passed unaggregated, so one location can occur
    # many times. To colour each country by its mean rating instead, use:
    #     Data_locs_Years.groupby('Location', as_index=False)['Rating'].mean()

    fig = px.choropleth(
        Data_locs_Years,
        locations='Location',
        color='Rating',
        # Use the parameter (not a global) and separate the two years
        title=f"{GenreSelected} Movie Rating by Country ({YearStart}-{YearEnd})"
    )
    fig.show()

GS = 'Animation'  # GenreSelection
# Keep the rows where any of the genre columns equals the selected genre
movie_GenreSelected = movie[movie[G_list].eq(GS).any(axis=1)]

# Iteration to compare graphs in different year windows: window ends move
# back from the latest year in steps of `Step`; each window starts up to
# `Interval` years before its end, but never before the earliest year.
Step = 5
Interval = 10
if movie_GenreSelected.empty:
    # min()/max() of an empty selection would raise; report it instead
    print(f"No movies found for genre '{GS}'.")
else:
    MinYear = movie_GenreSelected['Year'].min()
    MaxYear = movie_GenreSelected['Year'].max()
    # At least one window, so a genre whose movies all share a single year
    # still produces a map
    n_windows = max(1, math.ceil((MaxYear - MinYear) / Step))
    for i in range(n_windows):
        YearEd = MaxYear - Step * i
        YearSt = max(MinYear, YearEd - Interval)
        MapOutPut(movie_GenreSelected, GS, C_list, YearSt, YearEd)

 

No comments: