import pandas as pd
import os

# # Get the current directory where the script and CSV files are located
# folder_path = os.getcwd()

# # Create an empty list to store dataframes
# dataframes = []

# # Loop through all the CSV files in the folder and append to the list
# for filename in os.listdir(folder_path):
#     if filename.endswith('.csv'):
#         df = pd.read_csv(filename)
#         dataframes.append(df)

# # Concatenate all the dataframes into one
# merged_df = pd.concat(dataframes, ignore_index=True)

# # Write the merged dataframe to a new CSV file
# merged_df.to_csv('gene2phenotype.csv', index=False)


# Build two tab-separated text files from 'gene2phenotype.csv':
#   * gene2phenotype-map.txt      - one line per gene symbol followed by the
#                                   space-separated, sorted, unique disease
#                                   identifiers seen for that gene
#   * gene2phenotype-mapnames.txt - one line per unique disease identifier
#                                   followed by its disease name
# The disease identifier ('disease mim') is the disease name itself with
# spaces replaced by underscores, so both columns of the second file carry
# the same text.

merged_df = pd.read_csv('gene2phenotype.csv')
# Literal (non-regex) replacement; being explicit avoids depending on the
# pandas-version-specific default of the ``regex`` argument.
merged_df['disease name'] = merged_df['disease name'].str.replace(
    ' ', '_', regex=False
)

# Take an independent copy of the two columns we need and derive the
# identifier column by plain assignment.  (Writing into ``df.columns.values``
# mutates the backing array of an immutable Index and is not supported.)
df = merged_df[['gene symbol', 'disease name']].copy()
df['disease mim'] = df['disease name']
# Rows without a disease name cannot be joined or sorted as text (NaN is a
# float), so they are skipped here; groupby below likewise skips rows whose
# gene symbol is missing.
df = df.dropna(subset=['disease name'])
# NOTE(review): the filter below was already disabled in the original script;
# presumably it is obsolete now that 'disease mim' mirrors 'disease name' -
# confirm before deleting.
# df = df[df['disease mim'] != 'No disease mim']


# One row per gene symbol holding its unique disease identifiers, sorted and
# joined by single spaces.
grouped_genes = (
    df.groupby('gene symbol')['disease mim']
    .apply(lambda names: ' '.join(sorted(set(names))))
    .reset_index()
)

# Define the output file paths
output_file_genes = 'gene2phenotype-map.txt'
output_file_disease = 'gene2phenotype-mapnames.txt'

# Write the gene symbol and its disease identifier list to the first file.
# UTF-8 is set explicitly so non-ASCII names do not depend on the locale.
with open(output_file_genes, 'w', encoding='utf-8') as f:
    f.write("#geneNS\tsym\n")  # Write the header
    f.writelines(
        f"{gene}\t{diseases}\n"
        for gene, diseases in grouped_genes.itertuples(index=False, name=None)
    )

# Write each unique disease identifier with its disease name to the second
# file, keeping the first occurrence in input order.
unique_disease_mim = df.drop_duplicates(subset=['disease mim'])
with open(output_file_disease, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{mim}\t{name}\n"
        for mim, name in zip(
            unique_disease_mim['disease mim'],
            unique_disease_mim['disease name'],
        )
    )

print(f"Data successfully written to {output_file_genes} and {output_file_disease}")