# Importing all the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the trials export; 'Rank' is only a running index, so it is discarded straight away
covid_data = pd.read_csv('covid_clinical_trials.csv').drop(columns=['Rank'])

# Brief overview: dimensions, column names and the first few records
header_rule = "=" * 50
print(header_rule)
print("COVID-19 Clinical Trials Dataset")
print(header_rule)
print(f"Shape of the dataset: {covid_data.shape[0]} rows and {covid_data.shape[1]} columns")
print(f"\nColumns in the dataset: {list(covid_data.columns)}")
print("\nFirst 5 rows of the dataset:")
covid_data.head()

==================================================
COVID-19 Clinical Trials Dataset
==================================================
Shape of the dataset: 5783 rows and 26 columns

Columns in the dataset: ['NCT Number', 'Title', 'Acronym', 'Status', 'Study Results', 'Conditions', 'Interventions', 'Outcome Measures', 'Sponsor/Collaborators', 'Gender', 'Age', 'Phases', 'Enrollment', 'Funded Bys', 'Study Type', 'Study Designs', 'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date', 'First Posted', 'Results First Posted', 'Last Update Posted', 'Locations', 'Study Documents', 'URL']

First 5 rows of the dataset:

# Share of null entries per column, as a percentage of the total row count
section_rule = "=" * 50
null_counts = covid_data.isnull().sum()
print(section_rule)
print("Percentage Of Missing values in each column:")
print((null_counts / len(covid_data)) * 100)

# Number of rows that are exact duplicates of an earlier row
print(section_rule)
print(f"Number of Duplicates: {covid_data.duplicated().sum()}")

# dtype pandas inferred for every column
print(section_rule)
print("Data Types:")
print(covid_data.dtypes)


# Descriptive summary covering both numeric and non-numeric columns
print(section_rule)
print("Summary Statistics:")
covid_data.describe(include='all')

==================================================
Percentage Of Missing values in each column:
NCT Number                  0.000000
Title                       0.000000
Acronym                    57.115684
Status                      0.000000
Study Results               0.000000
Conditions                  0.000000
Interventions              15.320768
Outcome Measures            0.605222
Sponsor/Collaborators       0.000000
Gender                      0.172921
Age                         0.000000
Phases                     42.555767
Enrollment                  0.587930
Funded Bys                  0.000000
Study Type                  0.000000
Study Designs               0.605222
Other IDs                   0.017292
Start Date                  0.587930
Primary Completion Date     0.622514
Completion Date             0.622514
First Posted                0.000000
Results First Posted       99.377486
Last Update Posted          0.000000
Locations                  10.115857
Study Documents            96.852845
URL                         0.000000
dtype: float64
==================================================
Number of Duplicates: 0
==================================================
Data Types:
NCT Number                  object
Title                       object
Acronym                     object
Status                      object
Study Results               object
Conditions                  object
Interventions               object
Outcome Measures            object
Sponsor/Collaborators       object
Gender                      object
Age                         object
Phases                      object
Enrollment                 float64
Funded Bys                  object
Study Type                  object
Study Designs               object
Other IDs                   object
Start Date                  object
Primary Completion Date     object
Completion Date             object
First Posted                object
Results First Posted        object
Last Update Posted          object
Locations                   object
Study Documents             object
URL                         object
dtype: object
==================================================
Summary Statistics:

# Names of all columns holding at least one null value, in dataset column order
column_has_null = covid_data.isnull().any()
missing_cols = column_has_null[column_has_null].index.tolist()
print("List of Columns with missing values:")
print(missing_cols)

List of Columns with missing values:
['Acronym', 'Interventions', 'Outcome Measures', 'Gender', 'Phases', 'Enrollment', 'Study Designs', 'Other IDs', 'Start Date', 'Primary Completion Date', 'Completion Date', 'Results First Posted', 'Locations', 'Study Documents']

# Interventions
# Flag rows without an 'Interventions' entry (1 = missing, 0 = present)
interventions_absent = covid_data['Interventions'].isna()
covid_data['Interventions_Missing'] = interventions_absent.astype(int)
# The mean of the flag per study type is the fraction of that type lacking an intervention
missing_rate_by_study_type = covid_data.groupby('Study Type')['Interventions_Missing'].mean()
print(missing_rate_by_study_type)

#<-- the missingness is tied to observational studies, hence treated as MAR

Study Type
Expanded Access                                                        0.00000
Expanded Access:Individual Patients                                    0.00000
Expanded Access:Individual Patients|Intermediate-size Population       0.00000
Expanded Access:Individual Patients|Treatment IND/Protocol             0.00000
Expanded Access:Intermediate-size Population                           0.00000
Expanded Access:Intermediate-size Population|Treatment IND/Protocol    0.00000
Expanded Access:Treatment IND/Protocol                                 0.00000
Interventional                                                         0.00000
Observational                                                          0.36506
Name: Interventions_Missing, dtype: float64

# Phases
# Boolean flag marking rows that have no 'Phases' entry
covid_data['Phases_Missing'] = covid_data['Phases'].isna()
# Contingency table: study type (rows) against the missing flag (columns)
phases_crosstab = pd.crosstab(covid_data['Study Type'], covid_data['Phases_Missing'])
# Rescale every row so it reads as percentages of that row's total
study_type_totals = phases_crosstab.sum(axis=1)
phases_crosstab_percent = phases_crosstab.div(study_type_totals, axis=0) * 100
print("Crosstab of Study Type vs Phases Missingness (in percentages):")
print(phases_crosstab_percent)

#<--- the missingness tracks the Study Type, hence treated as MAR

Crosstab of Study Type vs Phases Missingness (in percentages):
Phases_Missing                                      False  True 
Study Type                                                      
Expanded Access                                       0.0  100.0
Expanded Access:Individual Patients                   0.0  100.0
Expanded Access:Individual Patients|Intermediat...    0.0  100.0
Expanded Access:Individual Patients|Treatment I...    0.0  100.0
Expanded Access:Intermediate-size Population          0.0  100.0
Expanded Access:Intermediate-size Population|Tr...    0.0  100.0
Expanded Access:Treatment IND/Protocol                0.0  100.0
Interventional                                      100.0    0.0
Observational                                         0.0  100.0

# Locations
# Boolean flag marking rows that have no 'Locations' entry
locations_absent = covid_data['Locations'].isna()
covid_data['Locations_Missing'] = locations_absent
# Raw counts of present / missing locations for every recruitment status
locations_crosstab = pd.crosstab(covid_data['Status'], covid_data['Locations_Missing'])
print("Crosstab of Status vs Locations Missingness:")
print(locations_crosstab)

#<--- the missingness depends almost entirely on the trial Status, hence treated as MAR

Crosstab of Status vs Locations Missingness:
Locations_Missing          False  True 
Status                                 
Active, not recruiting       523      3
Approved for marketing         2      0
Available                     11      8
Completed                   1022      3
Enrolling by invitation      181      0
No longer available            7      5
Not yet recruiting           481    523
Recruiting                  2805      0
Suspended                     27      0
Temporarily not available      0      1
Terminated                    73      1
Withdrawn                     66     41

# Replace NaN in the text columns with an explicit placeholder label.
# Each column gets the label that describes why the value is absent.
placeholder_by_column = {
    'Acronym': 'Unavailable',
    'Other IDs': 'Unavailable',
    'Locations': 'Not Yet Finalized Or Withdrawn',
    'Study Documents': 'Documents Unavailable',
    'Phases': 'Not Applicable',
    'Interventions': 'None(Observational Study)',
}
for column_name, placeholder in placeholder_by_column.items():
    covid_data[column_name] = covid_data[column_name].fillna(placeholder)

# Phases is stored as a categorical once its gaps are filled
covid_data['Phases'] = covid_data['Phases'].astype('category')

# Rows with no "Outcome Measures", "Gender" or "Study Designs" are removed outright
covid_data = covid_data.dropna(subset=['Outcome Measures', 'Gender', 'Study Designs'])

# Inspect the distinct Gender labels before changing the dtype
gender_labels = covid_data['Gender'].unique()
print("Unique values in Gender column:")
print(gender_labels)

# Store Gender as a categorical column
covid_data['Gender'] = covid_data['Gender'].astype('category')

Unique values in Gender column:
['All' 'Female' 'Male']

# Fill missing Enrollment with the column median, then store it as integers
enrollment_median = covid_data['Enrollment'].median()
covid_data['Enrollment'] = covid_data['Enrollment'].fillna(enrollment_median).astype(int)
enrollment_values = covid_data['Enrollment']

# Histogram (with KDE) to look at the shape of the distribution
plt.figure(figsize=(10, 6))
sns.histplot(enrollment_values, bins=30, kde=True)
plt.title('Distribution of Enrollment')
plt.xlabel('Enrollment')
plt.ylabel('Frequency')
plt.show()

# Boxplot to expose outliers
plt.figure(figsize=(10, 6))
sns.boxplot(x=enrollment_values)
plt.title('Boxplot of Enrollment')
plt.xlabel('Enrollment')
plt.show()

# Parse every date-like column to datetime; values that cannot be parsed become NaT
date_columns = [
    'Start Date',
    'Primary Completion Date',
    'Completion Date',
    'First Posted',
    'Results First Posted',
    'Last Update Posted',
]
for date_column in date_columns:
    covid_data[date_column] = pd.to_datetime(covid_data[date_column], errors='coerce')

# Fill the gaps in the three study-timeline columns with that column's median date
# ("Results First Posted" is deliberately left with its missing values)
for date_column in ('Start Date', 'Primary Completion Date', 'Completion Date'):
    median_date = covid_data[date_column].median()
    covid_data[date_column] = covid_data[date_column].fillna(median_date)

# Remove the temporary missingness-indicator columns
indicator_columns = ['Interventions_Missing', 'Phases_Missing', 'Locations_Missing']
covid_data = covid_data.drop(columns=indicator_columns)

# Verify what is still missing after the handling above
report_rule = "=" * 50
print(report_rule)
print("Missing values in each column after handling:")
print(report_rule)
print(covid_data.isnull().sum())
print(covid_data.shape)

==================================================
Missing values in each column after handling:
==================================================
NCT Number                    0
Title                         0
Acronym                       0
Status                        0
Study Results                 0
Conditions                    0
Interventions                 0
Outcome Measures              0
Sponsor/Collaborators         0
Gender                        0
Age                           0
Phases                        0
Enrollment                    0
Funded Bys                    0
Study Type                    0
Study Designs                 0
Other IDs                     0
Start Date                    0
Primary Completion Date       0
Completion Date               0
First Posted                  0
Results First Posted       5703
Last Update Posted            0
Locations                     0
Study Documents               0
URL                           0
dtype: int64
(5739, 26)

# Confirming Data Validation: re-inspect the dtype of every column after the
# category / integer / datetime conversions performed earlier in the script
covid_data.dtypes

NCT Number                         object
Title                              object
Acronym                            object
Status                             object
Study Results                      object
Conditions                         object
Interventions                      object
Outcome Measures                   object
Sponsor/Collaborators              object
Gender                           category
Age                                object
Phases                           category
Enrollment                          int64
Funded Bys                         object
Study Type                         object
Study Designs                      object
Other IDs                          object
Start Date                 datetime64[ns]
Primary Completion Date    datetime64[ns]
Completion Date            datetime64[ns]
First Posted               datetime64[ns]
Results First Posted       datetime64[ns]
Last Update Posted         datetime64[ns]
Locations                          object
Study Documents                    object
URL                                object
dtype: object

# Draw five random 'Age' entries to see how the raw strings are formatted
covid_data['Age'].sample(5).tolist()

['18 Years and older \xa0 (Adult, Older Adult)',
 '18 Years and older \xa0 (Adult, Older Adult)',
 '18 Years and older \xa0 (Adult, Older Adult)',
 '18 Years and older \xa0 (Adult, Older Adult)',
 '18 Years and older \xa0 (Adult, Older Adult)']

# Importing regex
import re
def engineered_age_groups(age_string):
    """Derive child / adult / older-adult eligibility flags from an age string.

    Parameters
    ----------
    age_string : str or NaN
        Free-text age eligibility, e.g. ``'18 Years and older'``,
        ``'up to 17 Years'`` or ``'18 Years to 64 Years'``.

    Returns
    -------
    pd.Series
        Three booleans in the order ``[is_child, is_adult, is_older_adult]``.
        All three are False when the input is missing or contains no number.

    Notes
    -----
    Every number is read as an age in years; units such as "Months" are not
    converted.
    """
    no_groups = [False, False, False]

    if pd.isna(age_string):
        return pd.Series(no_groups)

    # Extracting all numbers from the string
    numbers = [int(n) for n in re.findall(r'\d+', age_string)]
    if not numbers:
        return pd.Series(no_groups)

    if len(numbers) > 1:
        # An explicit range: the smallest and largest numbers are the bounds
        min_age, max_age = min(numbers), max(numbers)
    elif 'up to' in age_string.lower():
        # "up to N": the single number is the upper bound, not the minimum age
        min_age, max_age = 0, numbers[0]
    else:
        # "N and older": the single number is the minimum age;
        # 150 is a placeholder meaning "no upper limit"
        min_age, max_age = numbers[0], 150

    # A group is flagged when the [min_age, max_age] range overlaps it:
    # child = 0-17, adult = 18-64, older adult = 65+
    is_child = min_age <= 17
    is_older_adult = max_age >= 65
    is_adult = not (max_age < 18 or min_age > 64)

    return pd.Series([is_child, is_adult, is_older_adult])

# Build the three boolean age-group columns from the raw 'Age' text
age_cols = ['is_child', 'is_adult', 'is_older_adult']
age_flags = covid_data['Age'].apply(engineered_age_groups)
covid_data[age_cols] = age_flags

# Look at the first record to confirm the new columns are present
covid_data.head(1) #<--- Good to go !!!

# Draw five random 'Study Designs' entries to see how the raw strings are formatted
covid_data['Study Designs'].sample(5).tolist()

['Allocation: N/A|Intervention Model: Single Group Assignment|Masking: None (Open Label)|Primary Purpose: Treatment',
 'Allocation: Randomized|Intervention Model: Parallel Assignment|Masking: Double (Participant, Investigator)|Primary Purpose: Treatment',
 'Allocation: N/A|Intervention Model: Single Group Assignment|Masking: None (Open Label)|Primary Purpose: Screening',
 'Observational Model: Cohort|Time Perspective: Other',
 'Observational Model: Cohort|Time Perspective: Retrospective']

# Splitting by the pipe '|' and then splitting each piece at its first ': '
def parse_design(design_str):
    """Parse a pipe-delimited 'Key: Value' study-design string into a dict.

    Parameters
    ----------
    design_str : str or NaN
        e.g. ``'Allocation: Randomized|Masking: None (Open Label)'``.

    Returns
    -------
    dict
        Maps each key to its value. Pieces without a ``': '`` separator are
        skipped. A missing input gives an empty dict.
    """
    if pd.isna(design_str):
        return {}

    parsed = {}
    for item in design_str.split('|'):
        # partition splits only at the FIRST ': ', so a value that itself
        # contains ': ' is kept whole instead of being truncated
        key, separator, value = item.partition(': ')
        if separator:
            parsed[key] = value
    return parsed

# Applying the function to create a column of dictionaries
design_dicts = covid_data['Study Designs'].apply(parse_design)

# Expanding the dictionaries into a separate DataFrame (one column per design key)
design_df = pd.json_normalize(design_dicts.tolist())

# covid_data no longer has a contiguous index (rows were dropped earlier), while
# design_df is numbered 0..n-1. concat(axis=1) aligns on index labels, so without
# this step the design columns land on the wrong rows and extra all-NaN rows appear.
design_df.index = covid_data.index

# Joining the design columns back row for row
covid_data = pd.concat([covid_data, design_df], axis=1)

# Now I have separate columns: 'Allocation', 'Masking', etc.

# confirming effect
covid_data.head(1)

# checking for missingness and shape after extracting new features
print(covid_data.isnull().sum())
print(covid_data.shape)

NCT Number                   43
Title                        43
Acronym                      43
Status                       43
Study Results                43
Conditions                   43
Interventions                43
Outcome Measures             43
Sponsor/Collaborators        43
Gender                       43
Age                          43
Phases                       43
Enrollment                   43
Funded Bys                   43
Study Type                   43
Study Designs                43
Other IDs                    43
Start Date                   43
Primary Completion Date      43
Completion Date              43
First Posted                 43
Results First Posted       5746
Last Update Posted           43
Locations                    43
Study Documents              43
URL                          43
is_child                     43
is_adult                     43
is_older_adult               43
Allocation                 2463
Intervention Model         2460
Masking                    2460
Primary Purpose            2460
Observational Model        3365
Time Perspective           3365
dtype: int64
(5782, 35)

# Rows without an NCT Number are not real trial records; remove them
covid_data = covid_data.dropna(subset=['NCT Number'])

# Design columns that do not apply to a given study type are NaN: label them "Unavailable"
featured_cols = [
    'Allocation',
    'Intervention Model',
    'Masking',
    'Primary Purpose',
    'Observational Model',
    'Time Perspective',
]
covid_data[featured_cols] = covid_data[featured_cols].fillna('Unavailable')

# "N/A" in Allocation uses the same "Unavailable" label for uniformity
allocation_text = covid_data['Allocation'].str
covid_data['Allocation'] = allocation_text.replace('N/A', "Unavailable")

# Enrollment goes back to int and Status becomes a categorical
covid_data = covid_data.astype({'Enrollment': int, 'Status': 'category'})

# Final confirmation of missingness and shape
remaining_nulls = covid_data.isnull().sum()
print(remaining_nulls)
print(covid_data.shape) #<-- I am good to go!...ignoring Results first posted

NCT Number                    0
Title                         0
Acronym                       0
Status                        0
Study Results                 0
Conditions                    0
Interventions                 0
Outcome Measures              0
Sponsor/Collaborators         0
Gender                        0
Age                           0
Phases                        0
Enrollment                    0
Funded Bys                    0
Study Type                    0
Study Designs                 0
Other IDs                     0
Start Date                    0
Primary Completion Date       0
Completion Date               0
First Posted                  0
Results First Posted       5703
Last Update Posted            0
Locations                     0
Study Documents               0
URL                           0
is_child                      0
is_adult                      0
is_older_adult                0
Allocation                    0
Intervention Model            0
Masking                       0
Primary Purpose               0
Observational Model           0
Time Perspective              0
dtype: int64
(5739, 35)

# setting style
sns.set_theme(style='whitegrid')

# plt.figure (lowercase) registers a new current figure with pyplot, so the
# countplot below is drawn on it at the requested size. plt.Figure only builds
# a detached Figure object that the plot never uses.
plt.figure(figsize=(12, 6))

# to order by frequency
status_order = covid_data['Status'].value_counts(ascending=True).index
ax = sns.countplot(data=covid_data, y='Status', order=status_order, hue='Status', palette='viridis', legend=False)

# adding count labels to each bar container
for container in ax.containers:
    ax.bar_label(container, padding=3, fontsize=8)

plt.title('Distribution of Covid-19 Clinical Trial Status', fontsize=15, fontweight='bold')
plt.xlabel('Number of Trials')
plt.ylabel('Current Status')
plt.show()

# Calculating Duration in Days
covid_data['duration_days'] = (covid_data['Completion Date'] - covid_data['Start Date']).dt.days

# Filtering out unrealistic data: non-positive durations and trials of 10 years
# (3650 days) or more, which are likely data entry errors or long-term follow-up
Trial_Duration_filtered = covid_data[(covid_data['duration_days'] > 0) & (covid_data['duration_days'] < 3650)].copy()

# Computed once and reused for both the line position and its legend label.
# Keeping the subscript out of the f-string also avoids reusing single quotes
# inside a single-quoted f-string, which is a SyntaxError before Python 3.12.
median_duration = Trial_Duration_filtered['duration_days'].median()

# plotting the distribution
sns.histplot(Trial_Duration_filtered['duration_days'], kde=True, color='teal', bins=50)

# adding Median line to show typical trial
plt.axvline(median_duration, color='red', linestyle='--', label=f'Median: {median_duration:.0f} days')

plt.title('Typical Duration of COVID-19 Clinical Trials', fontsize=15, fontweight='bold')
plt.xlabel('Duration in Days')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Bars are ordered from the least to the most frequent gender category
gender_order = covid_data['Gender'].value_counts(ascending=True).index

plt.figure(figsize=(10, 6))

# hue mirrors x (with the legend off) so the palette applies without a seaborn warning
ax = sns.countplot(data=covid_data, x='Gender', hue='Gender', order=gender_order, palette='Set2', legend=False)

# Write the count above every bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, padding=3)

plt.title('Gender Eligibility Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Eligible Gender')
plt.ylabel('Number of Trials')
plt.show()

# Number of trials open to each age group (sum of the boolean flag columns)
age_flag_columns = ['is_child', 'is_adult', 'is_older_adult']
age_summary = covid_data[age_flag_columns].sum()
# Display labels, in the same order as the flag columns
labels = ['Children (0-17)', 'Adults (18-64)', 'Older Adults (65+)']
bar_colors = ['#4C72B0', '#DD8452', '#55A868']

plt.figure(figsize=(10, 6))

# One bar per age group; hue mirrors x so each bar takes its own colour
ax = sns.barplot(x=labels, y=age_summary.values, hue=labels, palette=bar_colors, legend=False)

# Write the count above every bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, padding=3)

plt.title('Trial Reach Across Targeted Age Groups (Total Counts)', fontsize=14, fontweight='bold')
plt.ylabel('Number of Trials Supporting Group')
plt.show()

# Per trial, count how many of the three age-group flags are set
inclusivity_flag_columns = ['is_child', 'is_adult', 'is_older_adult']
covid_data['age_inclusivity_count'] = covid_data[inclusivity_flag_columns].sum(axis=1)

# Turn the age-group count into a descriptive category
def get_inclusivity_label(count):
    """Return the inclusivity label for a trial's number of eligible age groups.

    A count of 3 maps to the fully-inclusive label and 2 to the broad label.
    Every other value falls through to the narrow label.
    """
    if count == 3:
        label = 'Fully Inclusive (All Age Groups)'
    elif count == 2:
        label = 'Broad (2 Age Groups)'
    else:
        label = 'Narrow (1 Age Group Only)'
    return label

covid_data['inclusivity_label'] = covid_data['age_inclusivity_count'].apply(get_inclusivity_label)

# Number of trials per inclusivity category, most frequent first
inclusivity_counts = covid_data['inclusivity_label'].value_counts()

# Pull out only the first (largest) wedge. The offsets are sized from the data
# because matplotlib raises an error when 'explode' and the wedge count differ,
# which a fixed 3-tuple does whenever a category is absent.
wedge_offsets = [0.05 if position == 0 else 0 for position in range(len(inclusivity_counts))]

# Visualizing the levels of inclusivity
plt.figure(figsize=(8, 8))
inclusivity_counts.plot.pie(
    autopct='%1.1f%%',
    colors=['#6acc64', '#4878d0', '#d65f5f'],
    startangle=140,
    explode=wedge_offsets
)
plt.title('Proportion of Multi-Age Inclusivity in COVID Trials', fontweight='bold')
plt.ylabel('') # Removing y-label for clean pie chart
plt.show()

# Extracting the first category listed in the interventions string
def extract_intervention_type(x):
    """Return the leading intervention category of an 'Interventions' string.

    Parameters
    ----------
    x : str or NaN
        e.g. ``'Drug: Remdesivir|Device: Ventilator'``.

    Returns
    -------
    str
        The text before the first colon, stripped (``'Drug'`` above), or
        ``'No Intervention (Observational)'`` when the value is missing or is
        one of the "no intervention" placeholders.
    """
    no_intervention_label = 'No Intervention (Observational)'
    # 'None(Observational Study)' is the placeholder this script writes into
    # missing Interventions; a check against 'None' alone never matches it.
    no_intervention_markers = ('None', 'None(Observational Study)')

    if pd.isna(x) or x.strip() in no_intervention_markers:
        return no_intervention_label

    # Split by colon to get 'Drug', 'Device', etc.
    return x.split(':')[0].strip()

# Leading intervention category of every trial
intervention_types = covid_data['Interventions'].apply(extract_intervention_type)
covid_data['Primary_Intervention_Type'] = intervention_types

# Horizontal count plot, most frequent category first
order = covid_data['Primary_Intervention_Type'].value_counts().index
plt.figure(figsize=(12, 6))
ax = sns.countplot(
    data=covid_data,
    y='Primary_Intervention_Type',
    hue='Primary_Intervention_Type',
    order=order,
    palette='magma',
    legend=False,
)

# Write the count next to every bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, padding=3)

plt.title('Primary Intervention Types in COVID-19 Research', fontsize=14, fontweight='bold')
plt.ylabel('Intervention Category')
plt.xlabel('Number of Trials')
plt.show()

# Clinical phases in their logical sequence, with 'Not Applicable' placed last
phase_order = [
    'Early Phase 1',
    'Phase 1',
    'Phase 1|Phase 2',
    'Phase 2',
    'Phase 2|Phase 3',
    'Phase 3',
    'Phase 4',
    'Not Applicable',
]

plt.figure(figsize=(12, 6))
ax = sns.countplot(
    data=covid_data,
    x='Phases',
    hue='Phases',
    order=phase_order,
    palette='viridis',
    legend=False,
)

# Write the count above every bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, padding=3)

plt.title('Clinical Phase Progression: The Drug Development Funnel', fontsize=14, fontweight='bold')
plt.ylabel('Number of Trials')
plt.xticks(rotation=45)
plt.show()

# A log axis cannot show zero, so only trials with positive enrollment are kept
has_positive_enrollment = covid_data['Enrollment'] > 0
covid_Trial_enrollment = covid_data[has_positive_enrollment].copy()

plt.figure(figsize=(12, 6))

# log_scale=True spreads the wide enrollment range evenly along the x-axis
ax = sns.histplot(
    data=covid_Trial_enrollment,
    x='Enrollment',
    log_scale=True,
    kde=True,
    color='royalblue',
)

plt.title('Distribution of Trial Enrollment Scale (Log Scale)', fontsize=16, fontweight='bold')
plt.ylabel('Frequency of Trials', fontsize=12, fontweight='bold')
plt.xlabel('Number of Participants (Log 10 Scale)', fontsize=12, fontweight='bold')

plt.show()

# Category orders for both panels, most frequent first
mask_order = covid_data['Masking'].value_counts().index
alloc_order = covid_data['Allocation'].value_counts().index

# Side-by-side panels for the rigor analysis: masking on the left, allocation on the right
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
masking_panel, allocation_panel = axes[0], axes[1]

# Left panel: Masking distribution
sns.countplot(ax=masking_panel, data=covid_data, y='Masking', hue='Masking',
              order=mask_order, palette='crest', legend=False)
masking_panel.set_title('Distribution of Masking (Blinding)', fontsize=14, fontweight='bold')
masking_panel.set_ylabel('Masking Type', fontweight='bold')
masking_panel.set_xlabel('Count', fontweight='bold')

# Right panel: Allocation distribution
sns.countplot(ax=allocation_panel, data=covid_data, y='Allocation', hue='Allocation',
              order=alloc_order, palette='magma', legend=False)
allocation_panel.set_title('Distribution of Participant Allocation', fontsize=14, fontweight='bold')
allocation_panel.set_ylabel('Allocation Type', fontweight='bold')
allocation_panel.set_xlabel('Count', fontweight='bold')

# Bold count labels on every bar of both panels
for ax in axes:
    for bar_group in ax.containers:
        ax.bar_label(bar_group, padding=3, fontweight='bold')

plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))

# 'Sponsor/Collaborators' is a '|'-separated list.
# NOTE(review): this assumes the lead sponsor is the first entry, as in the
# ClinicalTrials.gov export — confirm against the data source.
# Counting the whole string would count sponsor+collaborator combinations
# instead of the lead sponsors the chart title promises.
lead_sponsors = covid_data['Sponsor/Collaborators'].str.split('|').str[0].str.strip()

# Getting the Top 10 Lead Sponsors
top_sponsors = lead_sponsors.value_counts().head(10)

# Create the plot (hue mirrors y so each bar takes its own palette colour)
ax = sns.barplot(
    y=top_sponsors.index,
    x=top_sponsors.values,
    hue=top_sponsors.index,
    palette='viridis',
    legend=False
)

# Add bold counts to the bars
for container in ax.containers:
    ax.bar_label(container, padding=3, fontweight='bold')

plt.title('Top 10 Lead Sponsors of COVID-19 Clinical Trials', fontsize=16, fontweight='bold')
plt.xlabel('Number of Trials Led', fontsize=12, fontweight='bold')
plt.ylabel('Organization Name', fontsize=12, fontweight='bold')

plt.show()

# Frequency of every funder category; its index fixes the bar order (most frequent first)
funder_counts = covid_data['Funded Bys'].value_counts()

plt.figure(figsize=(10, 6))

ax = sns.countplot(data=covid_data, y='Funded Bys', hue='Funded Bys',
                   order=funder_counts.index, palette='Set1', legend=False)

# Bold count label next to every bar
for bar_group in ax.containers:
    ax.bar_label(bar_group, padding=3, fontweight='bold')

plt.title('The Funding Landscape', fontsize=16, fontweight='bold')
plt.xlabel('Number of Trials', fontsize=12, fontweight='bold')
plt.ylabel('Funder Category', fontsize=12, fontweight='bold')


plt.show()

Field	Description	Type
`NCT Number`	The unique identification code given to each clinical study upon registration at ClinicalTrials.gov. The format is "NCT" followed by an 8-digit number (for example, NCT00000419)	Nominal Categorical
`Title`	The official title of a protocol used to identify a clinical study or a short title written in language intended for the lay public	Nominal Categorical
`Acronym`	The acronym or initials used to identify a clinical study (not all studies have one)	Nominal categorical
`Status`	Indicates the current recruitment status: 1. Not yet recruiting; 2. Recruiting; 3. Enrolling by invitation; 4. Active, not recruiting; 5. Suspended; 6. Terminated; 7. Completed; 8. Withdrawn; 9. Unknown	Nominal categorical
`Study Results`	A study record that includes the summary results posted in the ClinicalTrials.gov results database. Summary results information includes participant flow, baseline characteristics, outcome measures, and adverse events (including serious adverse events)	Nominal categorical
`Conditions`	The disease, disorder, syndrome, illness, or injury that is being studied. On ClinicalTrials.gov, conditions may also include other health-related issues, such as lifespan, quality of life, and health risks	Nominal categorical
`Interventions`	A process or action that is the focus of a clinical study. Interventions include drugs, medical devices, procedures, vaccines, and other products that are either investigational or already available. Interventions can also include noninvasive approaches, such as education or modifying diet and exercise	Nominal Categorical
`Outcome Measures`	For clinical trials, a planned measurement described in the protocol that is used to determine the effect of an intervention/treatment on participants. For observational studies, a measurement or observation that is used to describe patterns of diseases or traits, or associations with exposures, risk factors, or treatment. Types of outcome measures include primary outcome measure and secondary outcome measure	Nominal Categorical
`Sponsor/Collaborators`	The organization or person who initiates the study and who has authority and control over the study	Nominal categorical
`Gender`	Male, Female and All	Nominal categorical
`Age`	A type of eligibility criteria that indicates the age a person must be to participate in a clinical study. This may be indicated by a specific age or the following age groups: 1. Child (birth-17). 2. Adult (18-64). 3. Older Adult (65+)	Ordinal Categorical
`Phases`	The stage of a clinical trial studying a drug or biological product, based on definitions developed by the U.S. Food and Drug Administration (FDA). The phase is based on the study's objective, the number of participants, and other characteristics. There are five phases: Early Phase 1 (formerly listed as Phase 0), Phase 1, Phase 2, Phase 3, and Phase 4. Not Applicable is used to describe trials without FDA-defined phases, including trials of devices or behavioral interventions	Nominal categorical
`Enrollment`	The number of participants in a clinical study. The "estimated" enrollment is the target number of participants that the researchers need for the study	Discrete numeric
`Funded Bys`	Describes the organization that provides funding or support for a clinical study. This support may include activities related to funding, design, implementation, data analysis, or reporting. Organizations listed as sponsors and collaborators for a study are considered the funders of the study. ClinicalTrials.gov refers to four types of funders: 1. U.S. National Institutes of Health 2. Other U.S. Federal agencies (for example, Food and Drug Administration, Centers for Disease Control and Prevention, or U.S. Department of Veterans Affairs). 3. Industry (for example: pharmaceutical and device companies). 4. All others (including individuals, universities, and community-based organizations)	Nominal categorical
`Study Type`	Describes the nature of a clinical study. Study types include interventional studies (also called clinical trials), observational studies (including patient registries), and expanded access	Nominal Categorical
`Study Designs`	The investigative methods and strategies used in the clinical study	Nominal categorical
`Other IDs`	Identifiers or ID numbers other than the NCT number that are assigned to a clinical study by the study's sponsor, funders, or others. These numbers may include unique identifiers from other trial registries and National Institutes of Health grant numbers	Nominal Categorical
`Start Date`	The actual date on which the first participant was enrolled in a clinical study. The "estimated" study start date is the date that the researchers think will be the study start date	Interval
`Primary Completion Date`	The date on which the last participant in a clinical study was examined or received an intervention to collect final data for the primary outcome measure. Whether the clinical study ended according to the protocol or was terminated does not affect this date. For clinical studies with more than one primary outcome measure with different completion dates, this term refers to the date on which data collection is completed for all the primary outcome measures. The "estimated" primary completion date is the date that the researchers think will be the primary completion date for the study	Interval
`First Posted`	The date on which the study record was first available on ClinicalTrials.gov after National Library of Medicine (NLM) quality control (QC) review has concluded. There is typically a delay of a few days between the date the study sponsor or investigator submitted the study record and the first posted date.	Interval
`Results First Posted`	The date on which summary results information was first available on ClinicalTrials.gov after National Library of Medicine (NLM) quality control (QC) review has concluded. There is typically a delay between the date the study sponsor or investigator first submits summary results information (the results first submitted date) and the results first posted date. Some results information may be available at an earlier date if Results First Posted with QC Comments	Interval
`Last Update Posted`	The most recent date on which changes to a study record were made available on ClinicalTrials.gov. There may be a delay between when the changes were submitted to ClinicalTrials.gov by the study's sponsor or investigator (the last update submitted date) and the last update posted date	Interval
`Locations`	A place where a research site for a clinical study can be found. Location information can be searched using a facility name, a city, state, zip code, or country. A location where a study is being conducted may also include contact information	Nominal categorical
`Study Documents`	Refers to the type of documents that the study sponsor or principal investigator may add to their study record. These include a study protocol, statistical analysis plan, and informed consent form	Nominal categorical
`URL`	Link to study	Nominal categorical

	NCT Number	Title	Acronym	Status	Study Results	Conditions	Interventions	Outcome Measures	Sponsor/Collaborators	Gender	...	Other IDs	Start Date	Primary Completion Date	Completion Date	First Posted	Results First Posted	Last Update Posted	Locations	Study Documents	URL
0	NCT04785898	Diagnostic Performance of the ID Now™ COVID-19...	COVID-IDNow	Active, not recruiting	No Results Available	Covid19	Diagnostic Test: ID Now™ COVID-19 Screening Test	Evaluate the diagnostic performance of the ID ...	Groupe Hospitalier Paris Saint Joseph	All	...	COVID-IDNow	November 9, 2020	December 22, 2020	April 30, 2021	March 8, 2021	NaN	March 8, 2021	Groupe Hospitalier Paris Saint-Joseph, Paris, ...	NaN	https://ClinicalTrials.gov/show/NCT04785898
1	NCT04595136	Study to Evaluate the Efficacy of COVID19-0001...	COVID-19	Not yet recruiting	No Results Available	SARS-CoV-2 Infection	Drug: Drug COVID19-0001-USR\|Drug: normal saline	Change on viral load results from baseline aft...	United Medical Specialties	All	...	COVID19-0001-USR	November 2, 2020	December 15, 2020	January 29, 2021	October 20, 2020	NaN	October 20, 2020	Cimedical, Barranquilla, Atlantico, Colombia	NaN	https://ClinicalTrials.gov/show/NCT04595136
2	NCT04395482	Lung CT Scan Analysis of SARS-CoV2 Induced Lun...	TAC-COVID19	Recruiting	No Results Available	covid19	Other: Lung CT scan analysis in COVID-19 patients	A qualitative analysis of parenchymal lung dam...	University of Milano Bicocca	All	...	TAC-COVID19	May 7, 2020	June 15, 2021	June 15, 2021	May 20, 2020	NaN	November 9, 2020	Ospedale Papa Giovanni XXIII, Bergamo, Italy\|P...	NaN	https://ClinicalTrials.gov/show/NCT04395482
3	NCT04416061	The Role of a Private Hospital in Hong Kong Am...	COVID-19	Active, not recruiting	No Results Available	COVID	Diagnostic Test: COVID 19 Diagnostic Test	Proportion of asymptomatic subjects\|Proportion...	Hong Kong Sanatorium & Hospital	All	...	RC-2020-08	May 25, 2020	July 31, 2020	August 31, 2020	June 4, 2020	NaN	June 4, 2020	Hong Kong Sanatorium & Hospital, Hong Kong, Ho...	NaN	https://ClinicalTrials.gov/show/NCT04416061
4	NCT04395924	Maternal-foetal Transmission of SARS-Cov-2	TMF-COVID-19	Recruiting	No Results Available	Maternal Fetal Infection Transmission\|COVID-19...	Diagnostic Test: Diagnosis of SARS-Cov2 by RT-...	COVID-19 by positive PCR in cord blood and / o...	Centre Hospitalier Régional d'Orléans\|Centre d...	Female	...	CHRO-2020-10	May 5, 2020	May 2021	May 2021	May 20, 2020	NaN	June 4, 2020	CHR Orléans, Orléans, France	NaN	https://ClinicalTrials.gov/show/NCT04395924

Column	Perc of Missingness	Classification	Decision
`Acronym`	57.115684	MCAR	Since not all studies have an acronym, I will replace NaN with "Unavailable"
`Interventions`	15.320768	MAR	Since missingness is correlated with observational studies in `Study Type`, I will impute it as None(Observational)
`Outcome Measures`	0.605222	MCAR	Drop rows
`Gender`	0.172921	MCAR	Drop rows and convert to Nominal categorical type
`Phases`	42.555767	MAR	Missingness is correlated with `Study Type`, as non-interventional studies do not have a phase; I will standardize it as "Not applicable" and convert to nominal categorical type
`Enrollment`	0.587930	MCAR	Convert to int type and apply median imputation
`Study Designs`	0.605222	MCAR	Drop Rows
`Other IDs`	0.017292	MCAR	Impute as "unavailable"
`Start Date`	0.587930	MCAR	Convert to datetime type and apply median imputation to missing values
`Primary Completion Date`	0.622514	MCAR	Convert to datetime type and apply median imputation to missing values
`Completion Date`	0.622514	MCAR	Convert to datetime type and apply median imputation to missing values
`Results First Posted`	99.377486	MAR	Convert to datetime type and standardize missing values as "NaT"
`Locations`	10.115857	MAR	Standardizing with "Location Not Yet Finalized or Withdrawn"
`Study Documents`	96.852845	MAR	Standardizing with "Document unavailable"

PROJECT TITLE: Exploratory Data Analysis On COVID-19 Clinical Trials¶

OBJECTIVES¶

Dataset Overview¶

Importing libraries and loading dataset¶

Data Cleaning and Validation¶

Handling Missing Data¶

Investigating Missingness in Cols Suspected to be due to MAR or MNAR¶

Classification and Decision on Missingness in Columns with missing values¶

Effecting Decision¶

Feature Engineering¶

Exploratory Data Analysis¶

Findings¶

1) Status Distribution & Trial Duration Status Trends¶

2) Demographic Inclusivity Gender:¶

3) Intervention Profiles & Phase Progression Intervention Types:¶

4) Enrollment Scale & Design Rigor Enrollment Scale:¶

5) Funding Landscape & Sponsors Dominant Sector:¶

Conclusion¶

Project Files
Visit my GitHub repo to access the full project files

	NCT Number	Title	Acronym	Status	Study Results	Conditions	Interventions	Outcome Measures	Sponsor/Collaborators	Gender	...	Other IDs	Start Date	Primary Completion Date	Completion Date	First Posted	Results First Posted	Last Update Posted	Locations	Study Documents	URL
count	5783	5783	2480	5783	5783	5783	4897	5748	5783	5773	...	5782	5749	5747	5747	5783	36	5783	5198	182	5783
unique	5783	5775	2338	12	2	3067	4337	5687	3631	3	...	5734	654	877	978	438	33	269	4255	182	5783
top	NCT04785898	Study Assessing Vagus Nerve Stimulation in CoV...	COVID-19	Recruiting	No Results Available	COVID-19	Other: No intervention	Mortality	Assistance Publique - Hôpitaux de Paris	All	...	COVID-19	May 1, 2020	December 31, 2020	December 31, 2021	April 24, 2020	November 4, 2020	April 8, 2021	Uhmontpellier, Montpellier, France	"Statistical Analysis Plan", https://ClinicalT...	https://ClinicalTrials.gov/show/NCT04785898
freq	1	2	47	2805	5747	720	32	5	78	5567	...	6	113	122	179	108	2	109	19	1	1
mean	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
std	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
min	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25%	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
50%	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
75%	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
max	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

PROJECT TITLE: Exploratory Data Analysis On COVID-19 Clinical Trials¶

OBJECTIVES¶

Dataset Overview¶

Importing libraries and loading dataset¶

Data Cleaning and Validation¶

Handling Missing Data¶

Investigating Missingness in Cols Suspected to be due to MAR or MNAR¶

Classification and Decision on Missingness in Columns with missing values¶

Effecting Decision¶

Feature Engineering¶

Exploratory Data Analysis¶

Findings¶

1) Status Distribution & Trial Duration Status Trends¶

2) Demographic Inclusivity Gender:¶

3) Intervention Profiles & Phase Progression Intervention Types:¶

4) Enrollment Scale & Design Rigor Enrollment Scale:¶

5) Funding Landscape & Sponsors Dominant Sector:¶

Conclusion¶

Project Files: Visit my GitHub repo to access the full project files

Project Files
Visit my GitHub repo to access the full project files