# Source code for iMaT.src.tokenization.refine_results.remove_prefixes

"""
Module: tokenization.refine_results.remove_prefixes.py
======================================================

This module, part of the `tokenization.refine_results` package, refines tokenized data in a CSV file by removing unwanted prefixes.

Functions
---------
- `corpus_tokenization_refine_data_remove_prefixes`: Handles a workflow for refining tokenized data by removing unwanted prefixes.

- `remove_prefixes_function`: Helper function used within `corpus_tokenization_refine_data_remove_prefixes` to refine a DataFrame.

Notes
-----
The module expects CSV files to have a specific structure, including a 'filename' column, and is designed to remove prefixes like 'Ignore_'.
Please refer to the individual function docstrings for more detailed descriptions and examples of usage.
"""
import re

import pandas as pd
from tqdm import tqdm

from iMaT.src.cli.menu_constructors import display_menu_print_results, display_menu_print_textblock, \
    display_menu_request_selection, util_convert_pd_dataframe_to_imat_datacont
from iMaT.src.tokenization.utils import save_data_to_new_csv_file, select_csv_file_2d_token_representation
from iMaT.src.utils.error_handling import handle_error


def corpus_tokenization_refine_data_remove_prefixes():
    """
    Run the interactive workflow that removes unwanted prefixes from a token CSV.

    The user is asked to select a CSV file. The file is loaded into a pandas
    DataFrame and handed to `remove_prefixes_function`, which strips "Ignore_"
    and "<column name>_" from the cells. The first 30 rows of the refined data
    are displayed, and the user can then choose to save the result to a new
    CSV file.

    Parameters
    ----------
    None

    Returns
    -------
    None

    Notes
    -----
    - If the file selection returns None, the workflow ends immediately.
    - Any exception raised during the workflow is passed to `handle_error`.
    - If `remove_prefixes_function` returns None (it does so after reporting
      an error of its own), the workflow stops instead of failing a second
      time on the missing DataFrame.

    See Also
    --------
    select_csv_file_2d_token_representation : Lets the user select a CSV file.
    remove_prefixes_function : Refines the pandas DataFrame by removing unwanted prefixes.
    """
    try:
        # NOTE(review): the indentation of the original source was lost; the
        # loop is assumed to end after one pass regardless of the save choice
        # (trailing `break` at loop level) -- confirm against the repository.
        while True:
            file_name = select_csv_file_2d_token_representation()
            if file_name is None:
                break

            df = remove_prefixes_function(pd.read_csv(file_name))
            if df is None:
                # The helper has already passed its exception to handle_error;
                # continuing would only raise a misleading AttributeError on
                # `df.head` below.
                break

            # Preview: show the user the first 30 rows of the refined data.
            results_dict = util_convert_pd_dataframe_to_imat_datacont(df.head(30))
            display_menu_print_results(results_dict)

            # Ask the user whether the refined data should be saved.
            yes_no_menu = {
                "menu_displayed_text": [
                    "Save Refined Data",
                    "Do you want to save the refined data to a new CSV file?",
                    "Please select your choice (1-2): ",
                    ["Choice", "Description"],
                ],
                "menu_entries": [
                    ["CONT: Save the new file", "Yes", "Yes, save the refined data to a new CSV file"],
                    ["DONT: Do not save the new file", "No", "No, do not save the refined data"],
                ]
            }
            save_input = display_menu_request_selection(yes_no_menu)

            if save_input.lower() == 'yes':
                # "no_prefixes_" is passed on to the save helper; presumably it
                # is used to build the new file name -- verify in
                # save_data_to_new_csv_file.
                new_file_path = save_data_to_new_csv_file(df, file_name, "no_prefixes_")

                textblock_dict_newfile = {
                    "menu_displayed_text": [
                        "-- New File Path --",
                        "Please read the following message:",
                        "<To continue, please press Enter>",
                        ["", "Message"],
                    ],
                    "menu_entries_text": [
                        ["New File Path", f"The refined data has been saved to a new CSV file: {new_file_path}"]
                    ]
                }
                display_menu_print_textblock(textblock_dict_newfile)

            break
    except Exception as e:
        handle_error(e)
def remove_prefixes_function(df):
    """
    Refine a pandas DataFrame by removing unwanted prefixes from the data.

    First, every occurrence of "Ignore_" is removed from the string entries
    of the DataFrame. Then, for each column, every occurrence of
    "<column name>_" is removed from the entries of that column. The column
    name is matched literally, so names containing regular-expression
    metacharacters (e.g. ".", "+", "(") are handled correctly.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to refine. It is modified in place.

    Returns
    -------
    pandas.DataFrame or None
        The same DataFrame object with the unwanted prefixes removed. If an
        exception occurs it is passed to `handle_error` and, provided that
        call returns, None is returned implicitly.

    Notes
    -----
    While removing the column-name prefixes every cell is converted with
    `str()`, so non-string values (including missing values) come back as
    their string representation.
    """
    try:
        print("Removing 'Ignore_' prefixes...")
        df.replace("Ignore_", "", regex=True, inplace=True)

        print("Removing column title prefixes...")
        # tqdm wraps the column iteration to show a progress bar.
        for col in tqdm(df.columns):
            # Escape the column name so it is matched as literal text, and
            # compile once per column instead of once per cell.
            col_prefix = re.compile(re.escape(f'{col}_'))
            df[col] = df[col].apply(
                lambda x, col_prefix=col_prefix: col_prefix.sub('', str(x))
            )

        return df
    except Exception as e:
        handle_error(e)