# Source code for iMaT.src.tokenization.refine_results.tokens_to_txt

"""
Module: tokenization.refine_results.tokens_to_txt.py
====================================================

This module, a part of the `tokenization.refine_results` package, handles the conversion of tokenized data from a CSV file
to individual text files.

Functions
---------
- `tokenization_export_csv_columns_to_txt_file`: Exports a CSV file's contents to individual text files, with directories
  for each column.

- `save_txt_files_to_directory`: Saves data from a dictionary into text files, organizing the files into directories based on keys.

Notes
-----
The module expects CSV files to have a specific structure, including a 'filename' column.
Please refer to the individual function docstrings for more detailed descriptions and examples of usage.
"""
import os
from datetime import datetime

import pandas as pd
from tqdm import tqdm

from iMaT.src.tokenization.utils import select_csv_file_2d_token_representation
from iMaT.src.utils.error_handling import handle_error

# Tokenizer names for which refining is available. Not referenced anywhere in
# this module -- presumably consumed by other modules (verify against callers).
tokenizers_available_for_refining = [
    'CPWord',
    'Octuple',
    'OctupleMono',
    'MuMIDI',
]

def tokenization_export_csv_columns_to_txt_file():
    """
    Export the columns of a user-selected CSV file to individual text files.

    The CSV path is obtained from `select_csv_file_2d_token_representation`.
    The file is loaded into a DataFrame and grouped by its 'filename' column.
    For every group, the values of each remaining column are converted to
    strings and joined with single spaces. The resulting mapping
    ``{filename: {column: joined_values}}`` is handed to
    `save_txt_files_to_directory`, which writes one text file per
    (filename, column) pair.

    Parameters: None

    Returns: None

    Notes
    -----
    - If no path is returned by the selection step, the function returns
      without doing anything.
    - If the CSV has no 'filename' column, a message is printed and nothing
      is written.
    - Values are converted with ``astype(str)``, so missing values end up in
      the output as the literal text ``nan``.
    - Any exception raised along the way is passed to `handle_error` instead
      of propagating from this function directly.

    See Also
    --------
    select_csv_file_2d_token_representation : Lets the user select a CSV file.
    save_txt_files_to_directory : Writes the grouped data to text files in directories named after each column.
    """
    try:
        file_path = select_csv_file_2d_token_representation()

        # Nothing was selected (presumably the dialog was cancelled): nothing to export.
        if file_path is None:
            return

        df = pd.read_csv(file_path)

        # Grouping is impossible without the 'filename' column.
        if 'filename' not in df.columns:
            print("'filename' column does not exist in the DataFrame.")
            return

        # Every column except the grouping key is exported; computed once
        # because each group carries the same columns as the full DataFrame.
        value_columns = [column for column in df.columns if column != 'filename']

        # Maps each filename to {column name: space-joined column values}.
        grouped_data = {}
        for name, group in tqdm(df.groupby('filename'), desc='Grouping data', unit='group'):
            grouped_data[name] = {
                column: ' '.join(group[column].astype(str).values)
                for column in value_columns
            }

        # Save the data to the new text files
        save_txt_files_to_directory(grouped_data, file_path)

    except Exception as e:
        handle_error(e)


def save_txt_files_to_directory(data, file_path):
    """
    Write grouped column data to text files, one directory per column.

    A new directory named 'extracted_data_[current date and time]' is created
    next to `file_path`. Inside it, one sub-directory is created per column
    name, and for every (filename, column) pair a text file named
    '<filename>_<column>.txt' is written into that column's sub-directory.

    Parameters
    ----------
    data : dict
        Mapping of filename to a dictionary ``{column name: text}``. The text
        is written verbatim to the corresponding file.
    file_path : str
        Path of the original file; only its directory component is used, as
        the parent of the newly created export directory.

    Returns
    -------
    str or None
        The path to the newly created directory. If an exception occurs it is
        passed to `handle_error`, and unless that helper raises, the function
        returns None.

    Notes
    -----
    Files are written as UTF-8 so the output does not depend on the
    platform's default encoding.

    See Also
    --------
    os.path.dirname : Returns the directory component of a pathname.
    os.makedirs : Recursively creates directories.
    """
    try:
        folder_path = os.path.dirname(file_path)  # Directory that contains the source file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        cleaned_csv_dir = os.path.join(folder_path, "extracted_data_" + timestamp)

        # Create the export directory (also when `data` is empty)
        os.makedirs(cleaned_csv_dir, exist_ok=True)

        for filename, column_data in tqdm(data.items(), desc='Writing to file', unit='file'):

            # `column_text` deliberately does not reuse the name `data`, so the
            # parameter is not rebound while its items are being iterated.
            for column_name, column_text in column_data.items():
                # One directory per column, shared by all filenames
                new_dir_path = os.path.join(cleaned_csv_dir, column_name)
                os.makedirs(new_dir_path, exist_ok=True)

                output_file_path = os.path.join(new_dir_path, f"{filename}_{column_name}.txt")

                # Explicit encoding: the default is platform dependent
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    output_file.write(column_text)

        return cleaned_csv_dir

    except Exception as e:
        handle_error(e)