import csv

import pandas as pd

# Define the file path
file_path = 'goa_human.gaf.gz'

# Load the annotation table. Only three columns are needed, addressed by their
# 0-based position in the file:
#   0 -> source database, 2 -> gene name/symbol, 4 -> GO term
# Notes on the parser options:
#   * usecols/dtype=str: avoids loading and type-guessing the unused columns.
#   * quoting=QUOTE_NONE: the file is plain tab-separated text, so a stray
#     double quote in a field must not be treated as the start of a quoted
#     region spanning tabs or newlines.
#   * comment='!': skips the '!' header lines. pandas also cuts any data line
#     at a '!' occurring mid-line; that only truncates the line from that
#     point on, so the three columns read here are unaffected as long as they
#     contain no '!' themselves.
df = pd.read_csv(
    file_path,
    comment='!',
    sep='\t',
    header=None,
    usecols=[0, 2, 4],
    dtype=str,
    quoting=csv.QUOTE_NONE,
)

# Keep only the rows whose first column (position 0) equals 'UniProtKB'
df_filtered = df[df[0] == 'UniProtKB']

# Select gene name (position 2) and GO term (position 4) under readable
# labels. Rows missing either value are dropped: a missing GO term would
# otherwise make the string join below fail.
go_map = (
    df_filtered[[2, 4]]
    .rename(columns={2: 'Gene', 4: 'GO_term'})
    .dropna()
)

# Collapse to one row per gene: its distinct GO terms joined by single spaces.
# The terms are sorted so that the output is identical from run to run
# (iteration order of a set of strings is not stable across processes).
grouped_go_map = (
    go_map.groupby('Gene')['GO_term']
    .apply(lambda terms: ' '.join(sorted(set(terms))))
    .reset_index()
)

# Rename the columns to match the desired output format
grouped_go_map.columns = ['#geneNS', 'sym']

# Save the result as a tab-separated file with a header row
grouped_go_map.to_csv('GO-map.txt', sep='\t', header=True, index=False)

# Post-process the written file: every double quote (for example one added by
# the CSV writer around a field) is replaced with a space. The file is read
# and rewritten as UTF-8, the encoding to_csv uses by default.
with open('GO-map.txt', 'r', encoding='utf-8') as file:
    content = file.read().replace('"', ' ')

with open('GO-map.txt', 'w', encoding='utf-8') as file:
    file.write(content)
    
import re

# Define the file path
file_path = 'go_ontology.obo'
output_file_path = 'GO-mapnames.txt'

# Everything between one "[Term]" marker and the next (or end of file)
term_stanza_re = re.compile(r'\[Term\](.*?)(?=\[Term\]|\Z)', re.DOTALL)

# Tag patterns are anchored to the start of a line so that "id:" cannot match
# inside another tag such as "alt_id:". Only spaces/tabs are skipped after the
# colon, so an empty value never captures text from the following line.
id_re = re.compile(r'^id:[ \t]*(GO:\d+)', re.MULTILINE)
name_re = re.compile(r'^name:[ \t]*(\S.*)', re.MULTILINE)

# Parallel lists: ids[i] is the GO id whose name is names[i]
ids = []
names = []

# Read the whole ontology file and pull the id and name out of each term.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

for term in term_stanza_re.findall(content):
    id_match = id_re.search(term)
    name_match = name_re.search(term)

    # Terms lacking either a GO id or a name are skipped
    if id_match and name_match:
        ids.append(id_match.group(1))
        # Trailing whitespace is removed first so it does not turn into
        # trailing underscores; inner spaces become underscores.
        names.append(name_match.group(1).rstrip().replace(' ', '_'))

# Write one "<GO id><TAB><name>" line per term to GO-mapnames.txt
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{id_}\t{name}\n" for id_, name in zip(ids, names))

print(f"Data has been written to {output_file_path}")