# Source code for iMaT.src.tokenization.refine_results.remove_prefixes

"""
Module: tokenization.refine_results.remove_prefixes.py
======================================================

This module, part of the `tokenization.refine_results` package, refines tokenized data in a CSV file by removing unwanted prefixes.

Functions
---------
- `corpus_tokenization_refine_data_remove_prefixes`: Handles a workflow for refining tokenized data by removing unwanted prefixes.

- `remove_prefixes_function`: Helper function used within `corpus_tokenization_refine_data_remove_prefixes` to refine a DataFrame.

Notes
-----
The module expects CSV files to have a specific structure, including a 'filename' column, and is designed to remove prefixes like 'Ignore_'.
Please refer to the individual function docstrings for more detailed descriptions and examples of usage.
"""
import re

import pandas as pd
from tqdm import tqdm

from iMaT.src.cli.menu_constructors import display_menu_print_results, display_menu_print_textblock, \
    display_menu_request_selection, util_convert_pd_dataframe_to_imat_datacont
from iMaT.src.tokenization.utils import save_data_to_new_csv_file, select_csv_file_2d_token_representation
from iMaT.src.utils.error_handling import handle_error


def corpus_tokenization_refine_data_remove_prefixes():
    """
    Interactively remove unwanted prefixes from a tokenized CSV file.

    Workflow:

    1. Ask the user for a CSV file via `select_csv_file_2d_token_representation`.
       If that call yields ``None`` the function returns without doing anything.
    2. Load the file with `pandas.read_csv` and refine it with
       `remove_prefixes_function` ("Ignore_" prefixes and prefixes matching
       column names are removed).
    3. Display the first 30 rows of the refined data.
    4. Show a Yes/No menu; on "Yes" the refined data is handed to
       `save_data_to_new_csv_file` together with the original file name and
       the marker "no_prefixes_", and the value returned by that call is
       shown to the user as the new file path.

    Any exception raised during these steps is passed to `handle_error`.

    Parameters: None

    Returns: None

    See Also
    --------
    select_csv_file_2d_token_representation : Opens a file dialog allowing the user to select a CSV file.
    remove_prefixes_function : Refines the pandas DataFrame by removing unwanted prefixes.
    """
    try:
        file_name = select_csv_file_2d_token_representation()
        if file_name is None:
            # No file was selected, so there is nothing to refine.
            return

        df = remove_prefixes_function(pd.read_csv(file_name))
        if df is None:
            # remove_prefixes_function reports its own failures through
            # handle_error and then yields None; stop here instead of
            # triggering a misleading secondary AttributeError on `df.head`.
            return

        # Preview: show the user the first 30 rows of the refined data.
        results_dict = util_convert_pd_dataframe_to_imat_datacont(df.head(30))
        display_menu_print_results(results_dict)

        # Ask the user whether the refined data should be written to a new file.
        yes_no_menu = {
            "menu_displayed_text": [
                "Save Refined Data",
                "Do you want to save the refined data to a new CSV file?",
                "Please select your choice (1-2): ",
                ["Choice", "Description"],
            ],
            "menu_entries": [
                ["CONT: Save the new file", "Yes", "Yes, save the refined data to a new CSV file"],
                ["DONT: Do not save the new file", "No", "No, do not save the refined data"],
            ]
        }
        save_input = display_menu_request_selection(yes_no_menu)

        if save_input.lower() != 'yes':
            return

        new_file_path = save_data_to_new_csv_file(df, file_name, "no_prefixes_")

        # Confirm to the user where the refined data was written.
        textblock_dict_newfile = {
            "menu_displayed_text": [
                "-- New File Path --",
                "Please read the following message:",
                "<To continue, please press Enter>",
                ["", "Message"],
            ],
            "menu_entries_text": [
                ["New File Path", f"The refined data has been saved to a new CSV file: {new_file_path}"]
            ]
        }
        display_menu_print_textblock(textblock_dict_newfile)

    except Exception as e:
        handle_error(e)


def remove_prefixes_function(df):
    """
    Refines a pandas DataFrame by removing unwanted prefixes from the data.

    Two passes are applied to `df` in place:

    1. Every occurrence of the text "Ignore_" is removed throughout the
       DataFrame.
    2. For each column, every occurrence of "<column name>_" is removed from
       that column's entries. The column name is matched literally, so names
       containing regular-expression metacharacters (such as ".", "+" or "(")
       are handled correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to refine. It is modified in place.

    Returns
    -------
    pandas.DataFrame or None
        The same DataFrame with the unwanted prefixes removed. If an exception
        occurs it is passed to `handle_error`, and ``None`` is returned
        (provided `handle_error` itself returns normally).

    Notes
    -----
    In the second pass every value is converted with ``str()`` before the
    substitution, so all columns of the result hold strings and missing values
    become the text "nan".
    """
    try:
        print("Removing 'Ignore_' prefixes...")
        df.replace("Ignore_", "", regex=True, inplace=True)

        print("Removing column title prefixes...")
        # tqdm renders a progress bar while the columns are processed.
        for col in tqdm(df.columns):
            # Escape the label so it is matched as literal text rather than
            # interpreted as a regular expression; compile once per column
            # instead of once per cell.
            prefix_pattern = re.compile(re.escape(f"{col}_"))
            df[col] = df[col].apply(
                lambda value, pattern=prefix_pattern: pattern.sub("", str(value))
            )

        return df

    except Exception as e:
        handle_error(e)
        return None