"""
Module: tokenization.main.py
============================
This module provides functions for tokenizing MIDI files in a selected directory. These files are converted into token sequences
using one or more tokenizers and saved as CSV files in a subdirectory of the original directory.
Functions
---------
- `tokenization_tokenize_folder_midi_files`: Tokenizes all MIDI files in a user-selected directory using selected tokenizers.
- `select_miditok_tokenizer`: Presents a user interface for tokenizer selection.
- `select_folder_midi_files`: Presents a user interface for folder selection.
- `tokenize_midi_file`: Tokenizes a MIDI file with a specified tokenizer class and saves the tokenized sequence to a CSV file.
- `extract_tokens_from_token_string_within_list`: Extracts tokens from a string representation of a list using regex.
- `extract_tokens_from_token_string_within_nested_list`: Extracts tokens from a string representation of a nested list using regex and abstract syntax trees (AST).
Notes
-----
Functions in this module can be used directly or through the CLI interface provided by the `cli.cli_menu_structure` module.
The tokenized sequences are saved as CSV files in a subdirectory of the original directory.
Please refer to the individual function docstrings for more detailed descriptions and examples of usage.
"""
import ast
import datetime
import os
import re
import tkinter as tk
from tkinter import filedialog
import pandas as pd
from miditok import CPWord, MIDILike, MuMIDI, Octuple, OctupleMono, REMI, REMIPlus, Structured, TSD
from openpyxl.workbook import Workbook
from tqdm import tqdm
from iMaT.src.cli.menu_constructors import display_menu_print_textblock, \
display_menu_request_selection
from iMaT.src.tokenization.utils import combine_csv_files_in_directory, create_log_entry, display_success_rate, \
display_tokenizable_files_in_folder, \
get_tokenizable_files_in_folder
from iMaT.src.utils.error_handling import handle_error
# Registry of the MidiTok tokenizers offered in the tokenizer selection menu.
# Every entry has the shape [menu label, tokenizer class, short description];
# the label is also used to name the per-tokenizer output folder and CSV files.
MIDITOK_TOKENIZERS_LIST = [
    ['REMI', REMI, '<One-Dimensional: Event-based, includes timing, bar info>'],
    ['REMIPlus', REMIPlus, '<One-Dimensional: Extended REMI, multi-track, multi-signature>'],
    ['MIDILike', MIDILike, '<One-Dimensional: Converts MIDI messages to tokens>'],
    ['TSD', TSD, '<One-Dimensional: Similar to MIDI-Like, uses explicit Duration tokens>'],
    ['Structured', Structured, '<One-Dimensional: Similar to TSD, consistent token type succession>'],
    ['CPWord', CPWord, '<Two-Dimensional: Uses embedding pooling to reduce sequence length>'],
    ['Octuple', Octuple, '<Two-Dimensional: Embedding pooling, single note representation>'],
    ['OctupleMono', OctupleMono, '<Two-Dimensional: Like Octuple, but suited for one track>'],
    ['MuMIDI', MuMIDI, '<Two-Dimensional: Multitrack tasks, uses embedding pooling>'],
]

# CSV column headers per tokenizer label. They are applied to the token
# DataFrame only when the number of columns matches; otherwise the
# tokenization routine falls back to generic "column<N>" names.
MIDITOK_TOKENIZERS_HEADERS = {
    'REMI': ['Tokens'],
    'REMIPlus': ['Tokens'],
    'MIDILike': ['Tokens'],
    'TSD': ['Tokens'],
    'Structured': ['Tokens'],
    'CPWord': ['Family', 'Position', 'Pitch', 'Velocity', 'Duration', 'Rest'],
    'Octuple': ['Pitch', 'Velocity', 'Duration', 'Program', 'Position', 'Bar'],
    'OctupleMono': ['Pitch', 'Velocity', 'Duration', 'Position', 'Bar'],
    'MuMIDI': ['Type*', 'BarPosEnc', 'PositionPosEnc', 'Velocity', 'Duration'],
}

# Value handed over as `additional_tokens` when a tokenizer is instantiated
# (every tokenizer except 'Structured' and 'REMIPlus', which are created with
# their defaults). Only 'Rest' tokens are switched on here.
MIDITOK_TOKENIZERS_ADDITIONAL_TOKENS = {
    'Chord': False,
    'Program': False,
    'Rest': True,
    'Tempo': False,
    'TimeSignature': False,
    # Chord quality -> tuple of semitone offsets.
    'chord_maps': {
        '7aug': (0, 4, 8, 11),
        '7dim': (0, 3, 6, 9),
        '7dom': (0, 4, 7, 10),
        '7halfdim': (0, 3, 6, 10),
        '7maj': (0, 4, 7, 11),
        '7min': (0, 3, 7, 10),
        '9maj': (0, 4, 7, 10, 14),
        '9min': (0, 4, 7, 10, 13),
        'aug': (0, 4, 8),
        'dim': (0, 3, 6),
        'maj': (0, 4, 7),
        'min': (0, 3, 7),
        'sus2': (0, 2, 7),
        'sus4': (0, 5, 7),
    },
    'chord_tokens_with_root_note': True,
    'chord_unknown': False,
    'nb_tempos': 32,
    'programs': list(range(-1, 128)),
    'rest_range': (2, 16),
    'tempo_range': (40, 250),
    'time_signature_range': (8, 2),
}
def tokenization_tokenize_folder_midi_files():
    """
    Tokenizes all MIDI files in a user-selected directory using the selected tokenizers.

    The user is asked for a directory and for one tokenizer (or all of them).
    A subdirectory "tokenized_YYYYMMDD_HHMMSS" is created inside the chosen
    directory. For every tokenizer a folder "tokenizer_<name>" is created in
    it, and each MIDI file is written there as
    "<base name>_tokenizer_<name>_tokens.csv". After all files of a tokenizer
    were processed, `combine_csv_files_in_directory` is called for that folder
    with the file name "00_combined_tokenizer_<name>_tokens.csv".

    An Excel workbook "tokenization_log.xlsx" is created in the timestamped
    subdirectory; every file that fails to tokenize is reported to it through
    `create_log_entry`. A failing file never aborts the run.

    The function returns without creating anything when the folder selection
    is cancelled or when no tokenizer selection is available.

    Parameters
    ----------
    None

    Returns
    -------
    None

    See Also
    --------
    select_folder_midi_files : Function to interact with user for selecting a folder.
    display_tokenizable_files_in_folder : Function to display tokenizable files.
    select_miditok_tokenizer : Function to interact with user for selecting tokenizer(s).
    """
    folder_path = select_folder_midi_files()
    if folder_path is None:
        return

    tokenizable_files = display_tokenizable_files_in_folder(folder_path)
    selected_tokenizers = select_miditok_tokenizer()
    # select_miditok_tokenizer() yields None when its own error handling took
    # over; stop here instead of failing later on len(None) with a half-built
    # output directory.
    if not selected_tokenizers:
        return

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    tokenization_dir = os.path.join(folder_path, "tokenized_" + timestamp)
    os.mkdir(tokenization_dir)

    # Build the log path with os.path.join so it also lands inside
    # tokenization_dir on systems whose path separator is not a backslash.
    log_file_path = os.path.join(tokenization_dir, "tokenization_log.xlsx")
    workbook = Workbook()
    workbook.save(log_file_path)

    num_files_tokenized = len(tokenizable_files) * len(selected_tokenizers)
    num_tokenized = 0
    files_status = []

    for tokenizer_class in selected_tokenizers:
        # Look up the name of the tokenizer
        tokenizer_name = next((x[0] for x in MIDITOK_TOKENIZERS_LIST if x[1] == tokenizer_class), None)
        if not tokenizer_name:
            # Report the class that could not be resolved (the name is None here).
            unknown_tokenizer = getattr(tokenizer_class, '__name__', tokenizer_class)
            print(f"Error: Could not find tokenizer '{unknown_tokenizer}'")
            continue  # Skip if the tokenizer name is not found

        # 'Structured' and 'REMIPlus' are created with their default
        # configuration; all others receive the module-wide additional tokens.
        if tokenizer_name not in ['Structured', 'REMIPlus']:
            tokenizer = tokenizer_class(additional_tokens=MIDITOK_TOKENIZERS_ADDITIONAL_TOKENS, beat_res={(0, 16): 16})
        else:
            tokenizer = tokenizer_class()

        tokenized_file_folder = os.path.join(tokenization_dir, f"tokenizer_{tokenizer_name}")
        os.mkdir(tokenized_file_folder)

        # The column names only depend on the tokenizer, not on the file.
        headers = MIDITOK_TOKENIZERS_HEADERS.get(tokenizer_name, [])

        for file in tqdm(tokenizable_files, ncols=70):  # Wrap tokenizable_files with tqdm for progress bar
            try:
                midi_file_path = os.path.join(folder_path, file)
                tokens = tokenizer(midi_file_path)
                tokens_string = str(tokens)

                # check if the tokens are in a nested list
                if "[[" in tokens_string and "]]" in tokens_string:
                    tokens_list = extract_tokens_from_token_string_within_nested_list(tokens_string)
                else:
                    tokens_list = extract_tokens_from_token_string_within_list(tokens_string)

                # Convert the list of tokens to a DataFrame
                df = pd.DataFrame(tokens_list)

                # Check if the DataFrame has the right number of columns
                if len(df.columns) == len(headers):
                    df.columns = headers
                else:
                    print(
                        f'\nWarning: Number of columns in the DataFrame does not match the number of headers for '
                        f'tokenizer {tokenizer_name}. Defaulting to generic headers.\n')
                    df.columns = ['column' + str(i) for i in range(1, len(df.columns) + 1)]

                # Save the DataFrame to a CSV file
                base_name = os.path.splitext(file)[0]  # Get file base name without extension
                file_name_token = f"{base_name}_tokenizer_{tokenizer_name}_tokens.csv"
                file_path = os.path.join(tokenized_file_folder, file_name_token)
                df.to_csv(file_path, index=False)

                num_tokenized += 1
                files_status.append([tokenizer_name, file, "<successfully converted>"])
            except Exception as e:
                # Deliberately broad: one unreadable file must not stop the
                # batch. The failure is recorded and written to the log.
                files_status.append([tokenizer_name, file, str(e)])
                exception_list = [tokenizer_name, os.path.join(tokenized_file_folder, file), str(e)]
                create_log_entry(exception_list, log_file_path)

        combine_csv_files_in_directory(tokenized_file_folder, f"00_combined_tokenizer_{tokenizer_name}_tokens.csv")

    display_success_rate(files_status, num_files_tokenized, num_tokenized)
def select_miditok_tokenizer():
    """
    Asks the user which tokenizer(s) to apply.

    A menu is built from `MIDITOK_TOKENIZERS_LIST` plus a final "All" entry
    and shown through `display_menu_request_selection`. The value returned by
    that menu is wrapped in a list, unless the "All" entry was chosen, in
    which case the classes of all registered tokenizers are returned.

    Parameters
    ----------
    None

    Returns
    -------
    list
        A list containing the selected tokenizer(s) classes. If an exception
        occurs it is passed to `handle_error`; should that call return, this
        function returns None implicitly.

    See Also
    --------
    display_menu_request_selection : Function to display menu and request user selection.
    """
    try:
        # One row per registered tokenizer: [label, class, description].
        menu_rows = [[entry[0], entry[1], entry[2]] for entry in MIDITOK_TOKENIZERS_LIST]
        # Extra row whose value is the sentinel string checked below.
        menu_rows.append(["All", "return_all", "Apply all tokenizers"])

        menu_definition = {
            "menu_displayed_text": [
                "File Tokenization: Tokenizer Selection",
                "Please select one of the following tokenizers by entering the corresponding index number:",
                "Which tokenizer do you want to select? (<No. of menu item>): ",
                ["Tokenizer", "Description"]
            ],
            "menu_entries": menu_rows,
        }

        selection = display_menu_request_selection(menu_definition)
        if selection != "return_all":
            return [selection]
        return [entry[1] for entry in MIDITOK_TOKENIZERS_LIST]
    except Exception as e:
        handle_error(e)
def select_folder_midi_files():
    """
    Lets the user pick a folder containing MIDI files through a directory dialog.

    An information text block is printed first, then a Tk directory chooser
    is opened. If the chosen folder contains no tokenizable files (as reported
    by `get_tokenizable_files_in_folder`), an error text block is shown and
    the whole procedure starts again. Cancelling the dialog ends the loop.

    Parameters
    ----------
    None

    Returns
    -------
    str
        A string containing the path to the selected folder, or None if the
        user cancelled the dialog. If an exception occurs it is passed to
        `handle_error`; should that call return, None is returned implicitly.

    See Also
    --------
    get_tokenizable_files_in_folder : Function to get tokenizable files in the selected folder.
    display_menu_print_textblock : Function to display menu.
    """
    try:
        parsable_extensions = ['.midi', '.mid']
        while True:
            textblock_dict = {
                "menu_displayed_text": [
                    "-- Folder Selection --",
                    "Please read the following information:",
                    "<To continue and select a folder, please press Enter>",
                    ["", "Information"],
                ],
                "menu_entries_text": [
                    ["Parsable Music File Types",
                     ", ".join(parsable_extensions)],
                    ["Folder Requirements",
                     "The folder must contain at least one parsable music file of the above types to be processed by this tool."],
                    ["Return to Main Menu",
                     "If you do not have a suitable folder with valid files at the moment, press enter and close the selection display that will be displayed right after.\n"
                     "You will then be returned to the Main Menu."]
                ]
            }
            display_menu_print_textblock(textblock_dict)

            # NOTE(review): the root window is not withdrawn before the dialog
            # opens (a withdraw() call was disabled in the original code);
            # confirm whether that is intentional.
            root = tk.Tk()
            try:
                # Open a directory dialog and get the selected folder path
                folder_path = filedialog.askdirectory()
            finally:
                # Always tear the root window down, even if the dialog fails.
                root.destroy()

            # Any empty result counts as "cancelled", not only the empty
            # string, so an empty value can never reach the folder scan below.
            if not folder_path:
                return None

            if len(get_tokenizable_files_in_folder(folder_path)) == 0:
                error_message_dict = {
                    "menu_displayed_text": [
                        "File Conversion: Folder Selection - Error",
                        "The selected directory contains no parsable music files:",
                        "<To continue, please press Enter>",
                        ["", "Troubleshooting assistance:"]
                    ],
                    "menu_entries_text": [
                        ["Parsable Music File Types",
                         ", ".join(parsable_extensions)],
                        ["Reason 1:", "There are no parsable music files in the selected directory."],
                        ["Reason 2:", "Ensure the directory you select contains the music files you want to convert."],
                        ["Return to Main Menu",
                         "If you do not have a suitable folder with valid files at the moment, press enter and close the "
                         "selection display that will be displayed right after.\n"
                         "You will then be returned to the Main Menu."]
                    ]
                }
                display_menu_print_textblock(error_message_dict)
                continue  # ask again

            return folder_path
    except Exception as e:
        handle_error(e)
def tokenize_midi_file(midi_file_path, tokenizer_class, tokenized_file_folder):
    """
    Tokenizes one MIDI file and stores the resulting tokens as a CSV file.

    The tokenizer is instantiated without arguments and called with the MIDI
    path. The string form of its result is parsed back into a list of tokens
    by one of two helpers, depending on whether the string looks like a nested
    list (contains both "[[" and "]]"). The tokens are written without header
    and without index to "<tokenizer class name>_tokens.csv" inside
    `tokenized_file_folder`.

    Note that the output file name depends on the tokenizer class only, so
    tokenizing several MIDI files into the same folder with the same
    tokenizer overwrites the previous result.

    Parameters
    ----------
    midi_file_path : str
        Path to the MIDI file to be tokenized.
    tokenizer_class : class
        Class of the tokenizer to be used for tokenizing the MIDI file.
    tokenized_file_folder : str
        Path to the folder where the CSV file containing the tokenized sequence will be saved.

    Returns
    -------
    None
        Any exception raised during tokenization or saving is passed to
        `handle_error` instead of being propagated by this function directly.

    See Also
    --------
    extract_tokens_from_token_string_within_list : Function to extract tokens from a list.
    extract_tokens_from_token_string_within_nested_list : Function to extract tokens from a nested list.
    """
    try:
        # Instantiate the tokenizer, tokenize the file and take the string form.
        token_text = str(tokenizer_class()(midi_file_path))

        # Nested lists show up as doubled brackets in the string form.
        looks_nested = "[[" in token_text and "]]" in token_text
        parsed_tokens = (
            extract_tokens_from_token_string_within_nested_list(token_text)
            if looks_nested
            else extract_tokens_from_token_string_within_list(token_text)
        )

        token_frame = pd.DataFrame(parsed_tokens)

        # Write the tokens next to the other results of this tokenizer.
        output_path = os.path.join(tokenized_file_folder, f"{tokenizer_class.__name__}_tokens.csv")
        token_frame.to_csv(output_path, index=False, header=False)
    except Exception as e:
        handle_error(e)