import pandas as pd

# learn PFAM id to PFAM name mapping
pfam_data = pd.read_excel('Pfam-A.clans.xlsx', header=None)

pfam_names = pfam_data.iloc[:, [3, 4]]
# pfam_names.to_csv('gene_domainsnames.txt')
pfam_names.to_csv('domain-mapnames.txt', sep='\t', index=False, header=False)


pfam_data = pfam_data.iloc[:, [0, 3]]
pfam_data.columns = ['PFAMid', 'PFAMname']
pfam_mapping = pfam_data.set_index('PFAMid')['PFAMname'].to_dict()


# load PFAM id - gene name table
df = pd.read_excel('mart_export.xlsx')
df.dropna(subset=['Pfam ID', 'Gene name'], inplace=True)
# Replace Pfam IDs with Pfam domain names
df["Pfam ID"] = df["Pfam ID"].map(pfam_mapping)


# Group by 'Gene name' and aggregate 'Pfam ID' into a tab-separated list
df_grouped = df.groupby('Gene name')['Pfam ID'].apply(lambda x: '\t'.join(str(v) for v in x if pd.notna(v))).reset_index()
df_grouped.columns = ['Gene name', 'Domain names']

# Save to TSV file
tsv_file = 'domain-map.txt'
df_grouped.to_csv(tsv_file, sep='\t', index=False)


# Read the file and replace double quotes with spaces
with open('domain-map.txt', 'r') as file:
    content = file.read().replace('"', ' ')

# Write the modified content back to a new TSV file
with open('domain-map.txt', 'w') as file:
    file.write(content)
    
