import pandas as pd
import matplotlib.pyplot as plt

# Input table (tab-separated, gzip-compressed) and output file (CSV; gzip
# compression is inferred by pandas from the ".gz" suffix).
INPUT_PATH = 'supplementary_dataset_11_full_constraint_metrics.tsv.gz'
OUTPUT_PATH = 'LOEUF_scores.csv.gz'

# Metric columns to carry over; each one also gets a "<name>_rank" column.
cols_to_include = ['oe_lof_upper']

# Final column names used in the output file.
COLUMN_RENAMES = {
    'oe_lof_upper': 'LOEUF',
    'oe_lof_upper_rank': 'LOEUF_rank',
}


def build_loeuf_table(df, cols=None):
    """Return the per-gene score table with rank columns added.

    Keeps only rows whose ``canonical`` value equals ``True``, selects the
    ``gene`` column plus every column in *cols*, appends a ``<col>_rank``
    column for each (``Series.rank()`` defaults: ascending, ties receive the
    average rank, missing values stay missing), and renames
    ``oe_lof_upper`` / ``oe_lof_upper_rank`` to ``LOEUF`` / ``LOEUF_rank``.

    Args:
        df: DataFrame containing at least ``gene``, ``canonical`` and the
            columns named in *cols*.
        cols: Metric column names to include; defaults to the module-level
            ``cols_to_include``.

    Returns:
        A new DataFrame; *df* itself is never modified.

    Raises:
        KeyError: If a required column is missing from *df*.
    """
    if cols is None:
        cols = cols_to_include
    cols = list(cols)

    canonical_rows = df[df['canonical'].eq(True)]
    # Explicit copy: we are about to add columns, and writing into a
    # selection of another frame is what triggers SettingWithCopyWarning.
    table = canonical_rows[['gene'] + cols].copy()

    for col in cols:
        table[col + '_rank'] = table[col].rank()

    return table.rename(columns=COLUMN_RENAMES)


if __name__ == '__main__':
    # Load the TSV file into a DataFrame
    df = pd.read_csv(INPUT_PATH, sep='\t', compression='gzip')

    new_df = build_loeuf_table(df)

    # Display the new DataFrame
    print(new_df.head())

    new_df.to_csv(OUTPUT_PATH, index=False)


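# --- Optional sanity checks (disabled) -------------------------------------
# Uncomment to inspect the LOEUF column after the table has been built.
#
# oe_lof_values = new_df['LOEUF']
#
# max_value = oe_lof_values.max()
# # Print the maximum value
# print("Maximum value of 'oe_lof' column:", max_value)
#
#
# Look up a single gene (the last uncommented `genenow` assignment wins).
# Some genes with high intolerance by RVIS:
# genenow = 'CHD8'
# genenow = 'LRP1'
# genenow = 'DYNC1H1'
# Some genes with low intolerance by RVIS:
# genenow = 'MKI67'
# genenow = 'FLG'
# gene_row = new_df[new_df['gene'] == genenow]
# # Display the row
# print(gene_row)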