Monday, November 11, 2024

Normal Distribution Random Normal T-test Type I Error

 




 

This test is based on the standard probability density function (PDF) following a Gaussian distribution.

The PDF is configured with a mean of 0 and a standard deviation of 1 to generate the random values.

A series of these randomly generated values is then tested using a T-test to determine whether there is sufficient evidence to reject the null hypothesis (mean = 0).

In most cases, with a 95% confidence level (the standard probability level used in economics and finance), there is typically insufficient evidence to reject the null hypothesis, i.e., the mean is 0.

However, with a significantly high number of trials (e.g., 1000), under the same confidence level, there are instances where there is sufficient evidence to reject the null hypothesis, i.e., the mean is not 0.

# For statistics
import random
import numpy as np
from scipy import stats
from scipy.stats import norm
# For plotting
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
# For table output
import pandas as pd

# Setting Plot Size
plt.rcParams["figure.figsize"] = (10,7)

# Reference list
LinkList=["List of colours: https://matplotlib.org/stable/gallery/color/named_colors.html",
          "Random Normal Distribution (Gausian PDF): https://numpy.org/doc/stable/reference/random/generated/numpy.random.normal.html",
          "Normal Distribution Functions: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html",
          "T-test: ",
          "   - One sample: https://builtin.com/data-science/t-test-python",
          "   - Independent: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html",
          "   - Confidence Interval:",
          "      https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data",
          "      https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html",
          "Plot Filling https://matplotlib.org/stable/gallery/lines_bars_and_markers/fill_between_demo.html"]
for k in range(len(LinkList)):
    print(LinkList[k])

# Setting parameters
n=1000; # Number of elements
mu=0 # mean of the distribution
sigma=1 # Standard deviation
# Random normally distributed elements with n data points
x=np.random.normal(mu, sigma, n) # A variable of elements

# Gausian Probability Density Function (PDF)
def Gausian_PDF(bins,mu,sigma):
    g=1/(sigma*np.sqrt(2*np.pi))*np.exp(-(bins-mu)**2/(2*sigma**2)) # PDF formula
    return g

# Plotting function
def Disn_Hist(x,Confidence_Level):
    # Histogram and Probability-Density setting
    No_Bins=100 #Setting number of bins i.e. how many slices of bars
    Probability_Density, bins, ignored = plt.hist(x, No_Bins, density=True, color = "lightgreen",  label='Probability Density')
    # Mean and Standard Deviation of the sample data x
    x_bar=np.mean(x); s=np.std(x)
    # Normal distribution of the sample
    g_x=Gausian_PDF(bins,x_bar,s) # Calling PDF
    plt.plot(bins,g_x,linewidth=2, color='Green', label='Sample Distribution') # Plotting line graph of PDF
    plt.axvline(x=x_bar, label=f'x_bar is {round(x_bar,3)}', color='Green') # Sample mean
    # Normal distribution of the null hypothesis
    mu=0; sigma=1
    g_0=Gausian_PDF(bins,mu,sigma) # Calling PDF
    plt.plot(bins,g_0,linewidth=2, color='Grey', label='Null Hypothesis', linestyle='dashed') # Plotting line graph of PDF
    plt.axvline(x=x_bar, label=f'mu is {round(mu,3)}', color='Grey') # Sample mean
    # Confidence Interval: qnorm
    # Graoh Output
    plt.legend(loc="upper left")
    plt.title("Histogram & Mean Comparison")
    plt.show()

Confidence_Level=95
Disn_Hist(x,Confidence_Level)

# T-test testing the null-hypothesis
def Ttest(x, mu, Confidence_Level, Print):
    x_bar=np.mean(x)
    t,p=stats.ttest_1samp(x, mu, alternative='two-sided')
    z=stats.t.ppf((1+Confidence_Level/100)/2.,n-1) # Z-value
    CI=[x_bar-stats.sem(x)*z, x_bar+stats.sem(x)*z] # Confidence Interval
    #Plotting a Gausian PDF
    points=np.linspace(-3,3,100)
    SEM=1 #stats.sem(x)
    g_0=Gausian_PDF(points,mu,SEM)
    plt.plot(points,g_0,linewidth=2, color='Grey', label='Sample Distribution') # Plotting line graph of PDF
    plt.axvline(x=0, color='Grey') # Null-Hypothesis
    plt.title('t-test')
    plt.xlabel('t-value')
    # T-test critical point
    SigLevel=1-Confidence_Level/100
    # Lower Fill
    LowerT_Pnts=np.linspace(min(points),stats.norm.ppf(SigLevel/2))
    LowerT_Dsts=Gausian_PDF(LowerT_Pnts,0,1)
    #LowerT=[(bins[k],LowerT_Dsts[k]) for k in range(len(bins))]
    plt.fill_between(LowerT_Pnts,LowerT_Dsts, color='darkseagreen')
    # Upper Fill
    UpperT_Pnts=np.linspace(stats.norm.ppf(1-SigLevel/2),max(points))
    UpperT_Dsts=Gausian_PDF(UpperT_Pnts,0,1)
    #UpperT=[(bins[k],UpperT_Dsts[k]) for k in range(len(bins))]
    plt.fill_between(UpperT_Pnts,UpperT_Dsts, color='darkseagreen')
   
    plt.axvline(x=t, label=f't is {round(t,3)}', color='Magenta') # Null-Hypothesis
    h=['H0: mu = x_bar ∴ No sufficient evidence to reject the null hypothesis (H0).',
       'H1: mu ≠ x_bar ∴ Sufficient evidence to reject the null hypothesis (H0).']
    if Print==1:
        print(f'Confidence Interval: {CI}')
        print(f'T-value: {t},  p-value: {p}')
        print(h[int(np.floor((1-p)-0.05))])
    return t,p

print('\n')
print('T-test testing the null-hypothesis')
Confidence_Level=95 #%
ShowingTstatDetail=1
x=np.random.normal(mu, sigma, n) # A variable of elements
plt.figure(); Ttest(x, 0, Confidence_Level, ShowingTstatDetail)

# Why there is sometimes no significant evidence proving it is zero.
def Type_I_Error_test(Confidence_Level,Number_of_datapoints, Number_of_trial,Print):
    X_bar=[];T=[];P=[];SigLevel=1-Confidence_Level/100
    for l in range(Number_of_trial):
        mu=0; sigma=1; n=Number_of_datapoints
        x=np.random.normal(mu, sigma, n)
        t,p=stats.ttest_1samp(x, mu)
        if p<SigLevel:
            #plt.figure(); Disn_Hist(x,Confidence_Level)
            plt.figure(); Ttest(x, 0, Confidence_Level, Print)
            x_bar=np.mean(x)
            X_bar.append(x_bar)
            T.append(t)
            P.append(p)
    return X_bar,T,P

print('\n')
print('Why there is sometimes no significant evidence proving it is zero.')
Confidence_Level=95 #%
Number_of_datapoints=1000
Number_of_trial=100
ShowingTstatDetail=0
X_bar,T,P=Type_I_Error_test(Confidence_Level, Number_of_datapoints, Number_of_trial, ShowingTstatDetail)
print(f"There are {len(P)} times out of {Number_of_trial} times of the trial not be able to reject the null hypothesis.")
print(f"The percentage of no sufficient evidence to reject the null hypothesis is {round(len(P)/Number_of_trial,2)}%")
d = {'x_bar':X_bar,'t':T, 'p':P}
df = pd.DataFrame(data=d)
display(df)