#!/usr/bin/env python3
"""
This script processes multiple timing.csv files located in the current directory and
its subdirectories, performs various operations on their contents, and generates
merged CSV files.

The main steps performed by this script are:

1. Locate all timing.csv files in the current folder and its subfolders.
2. Load each CSV file into a DataFrame, keeping only the "Timer" and "Mean Time"
   columns.
3. Replace row names (index) ending with '_compiled', '_jax', or '_numpy' with
   '_kernel' in the individual DataFrames.
4. Merge all DataFrames using the union of all row names (Timer column) and keeping
   only the "Mean Time" columns, which are renamed to the name of each CSV file's
   containing folder.
5. Save the merged DataFrame to a new CSV file called merged_timings.csv in the
   current folder.
6. Similarly produce a merged_kernels_timings.csv file with the sum of the runtime
   for each kernel type, keeping only kernels that are GPU or for which more than
   one implementation was used.

To use this script, simply run it in a Python environment with the required
dependency (pandas) installed. The script will automatically locate timing.csv
files, process them, generate the merged DataFrame and save the merged_timings.csv
file.
"""

import glob
import os
import re

import numpy as np
import pandas as pd

# Timer-name suffixes identifying a kernel implementation, with the kernel type
# reported for each of them. Checked in this order.
_KERNEL_SUFFIXES = (
    ("_jax", "JAX"),
    ("_compiled", "COMPILED"),
    ("_numpy", "NUMPY"),
)

# Friendlier operation names for the Jax data movement timers.
_DATA_MOVEMENT_NAMES = {
    "INTERVALS_JAX.__init__": "accel_data_update_device",
    "INTERVALS_JAX.to_host": "accel_data_update_host",
}

# Kernel types considered to involve some GPU computation (or several
# implementations) when building the per-kernel summary.
_GPU_KERNEL_TYPES = ["MULTIPLE", "DATA_MOVEMENT", "JAX"]


def find_csv_files(folder, file_pattern):
    """
    Find all CSV files matching the specified pattern in the given folder and its
    subfolders.

    :param folder: The root folder to start searching for CSV files.
    :param file_pattern: The pattern of the CSV files to search for.
    :return: A sorted list of file paths matching the specified pattern.
    """
    # glob returns matches in arbitrary order; sorting keeps the column order of
    # the merged output reproducible from one run to the next.
    return sorted(glob.glob(os.path.join(folder, file_pattern), recursive=True))


def process_timer_path(s):
    """
    Process the input string and return three strings: simplified path, kernel type,
    and operation name.

    :param s: Input string to process.
    :return: A tuple containing the simplified path, kernel type (None when the
        timer is neither a kernel nor a data movement), and operation name.
    """
    # Remove occurrences of '(function) ' and '._exec'
    simplified_path = s.replace("(function) ", "").replace("._exec", "")

    # Determine kernel type
    kernel_type = None
    for suffix, suffix_kernel_type in _KERNEL_SUFFIXES:
        if s.endswith(suffix):
            kernel_type = suffix_kernel_type
            # NOTE: the tag is removed everywhere it appears in the path, not only
            # at the end of it.
            simplified_path = simplified_path.replace(suffix, "")
            break
    else:
        # No implementation suffix: look for data movement, then plain dispatch
        if ("accel_data" in simplified_path) or ("INTERVALS_JAX" in simplified_path):
            kernel_type = "DATA_MOVEMENT"
        elif "|dispatch|" in simplified_path:
            kernel_type = "DEFAULT"

    # Extract operation name (last element of the '|' separated path)
    operation_name = simplified_path.split("|")[-1]
    if kernel_type == "DATA_MOVEMENT":
        # Name clean-up specific to Jax data movement operations
        operation_name = _DATA_MOVEMENT_NAMES.get(operation_name, operation_name)

    # Add '_kernel' at the end of simplified_path for every kernel timer
    # (any recognised kernel type except data movement)
    if (kernel_type is not None) and (kernel_type != "DATA_MOVEMENT"):
        simplified_path += "_kernel"

    return simplified_path, kernel_type, operation_name


def load_csv_files(file_paths):
    """
    Load CSV files from the given file paths, keeping only the "Timer" and
    "Mean Time" columns and using the processed Timer values as index.

    Each returned DataFrame has one timing column named after the folder containing
    the CSV file, plus a 'kernel_type' and an 'operation_name' column.

    :param file_paths: A list of file paths to load.
    :return: A list containing the loaded DataFrames.
    """
    dataframes = []
    for file_path in file_paths:
        folder_name = os.path.basename(os.path.dirname(file_path))
        df = pd.read_csv(file_path, index_col="Timer", usecols=["Timer", "Mean Time"])

        # Process the index and extract simplified_path, kernel_type, and
        # operation_name. Comprehensions (rather than unpacking zip(*...)) keep
        # this working for a CSV file that has a header but no rows.
        processed = [process_timer_path(timer) for timer in df.index]

        # Replace index with simplified_path
        df.index = [simplified_path for simplified_path, _, _ in processed]

        # Add kernel_type and operation_name columns
        df["kernel_type"] = [kernel_type for _, kernel_type, _ in processed]
        df["operation_name"] = [operation_name for _, _, operation_name in processed]

        # Rename the 'Mean Time' column to the folder name
        df = df.rename(columns={"Mean Time": folder_name})
        dataframes.append(df)

    return dataframes


def _is_missing(kernel_type):
    """Return True if the kernel type is None or a float NaN."""
    return (kernel_type is None) or (
        isinstance(kernel_type, float) and np.isnan(kernel_type)
    )


def combine_kernel_types(k1, k2):
    """
    Combine two kernel types into a single kernel type, returning 'MULTIPLE' if
    they differ.

    :param k1: The first kernel type (string, None or NaN).
    :param k2: The second kernel type (string, None or NaN).
    :return: The other kernel type if one of them is missing, the common kernel
        type if both are equal, 'MULTIPLE' otherwise.
    """
    if _is_missing(k1):
        return k2
    if _is_missing(k2):
        return k1
    if k1 == k2:
        return k1
    return "MULTIPLE"


def merge_dataframes(dataframes):
    """
    Merge the given DataFrames using the union of all row names (outer join on the
    index). The timing columns of all inputs are kept, while the 'kernel_type' and
    'operation_name' columns are combined into a single column each.

    :param dataframes: A non-empty list of DataFrames to merge.
    :return: A merged DataFrame with the union of all row names.
    :raises IndexError: If the list of DataFrames is empty.
    """
    # Initialize the merged DataFrame with the first DataFrame
    merged_df = dataframes[0]

    # Merge the remaining DataFrames one-by-one
    for df in dataframes[1:]:
        merged_df = pd.merge(
            merged_df,
            df,
            left_index=True,
            right_index=True,
            how="outer",
            suffixes=("_1", "_2"),
        )

        # Combine kernel_type and operation_name columns
        merged_df["kernel_type"] = merged_df["kernel_type_1"].combine(
            merged_df["kernel_type_2"], combine_kernel_types
        )
        merged_df["operation_name"] = merged_df["operation_name_1"].combine_first(
            merged_df["operation_name_2"]
        )

        # Drop extra kernel_type and operation_name columns
        merged_df = merged_df.drop(
            columns=[
                "kernel_type_1",
                "kernel_type_2",
                "operation_name_1",
                "operation_name_2",
            ]
        )

    return merged_df


def merge_kernel_rows(df):
    """
    Filters the input DataFrame to keep only rows whose `kernel_type` is
    'MULTIPLE', 'DATA_MOVEMENT' or 'JAX', drops the `kernel_type` column, and
    groups the DataFrame by the `operation_name` column, summing the values of
    rows that get collapsed together.

    :param df: Input DataFrame to filter and group.
    :return: A filtered and grouped DataFrame, indexed by operation name.
    """
    # Filter DataFrame to keep only rows where kernel_type has some GPU computation
    # TODO switch to excluding 'NUMPY', 'DEFAULT' and None instead once compiled
    # becomes common
    df_filtered = df[df["kernel_type"].isin(_GPU_KERNEL_TYPES)]

    # Drop the kernel_type column
    df_filtered = df_filtered.drop(columns=["kernel_type"])

    # Group by operation_name and sum the values of rows that get collapsed
    # together (missing values are skipped by the sum). Using the groupby sum
    # directly, rather than apply(), avoids depending on whether apply() passes
    # the grouping column to the function, which changed across pandas versions.
    return df_filtered.groupby("operation_name").sum(numeric_only=False)


def main():
    """
    Merge all timing.csv files found under the current folder and write the
    merged_kernels_timings.csv and merged_timings.csv files.
    """
    folder = "."
    file_pattern = "**/timing.csv"

    csv_file_paths = find_csv_files(folder, file_pattern)
    if not csv_file_paths:
        # Nothing to merge: stop with an explicit message rather than failing
        # later on an empty list.
        raise SystemExit(f"No file matching '{file_pattern}' found in '{folder}'")

    dataframes = load_csv_files(csv_file_paths)
    merged_df = merge_dataframes(dataframes)
    merged_kernel_df = merge_kernel_rows(merged_df)

    # Save the merged kernel data to a CSV file
    output_kernels_file = "merged_kernels_timings.csv"
    merged_kernel_df.to_csv(output_kernels_file)
    print(f"Merged kernel data saved to '{output_kernels_file}'")

    # Save the merged DataFrame to a CSV file
    merged_df = merged_df.drop(columns=["kernel_type", "operation_name"])
    output_file = "merged_timings.csv"
    merged_df.to_csv(output_file)
    print(f"Merged data saved to '{output_file}'")


if __name__ == "__main__":
    main()