Source code for acutils.sheet

# Functions prefixed with "tmnt" are treatments and must have those parameters:
#  - src: absolute path to the file that will be processed (str)
#  - dstdir: absolute path to the directory that should contain new files (str)
# Also, nothing should be returned.

import pandas as pd



# @TODO Maybe update functions using df.isin() for masks.
[docs] def delete_clueless_rows(df, clueless_words=None, columns=None, inplace=False): ''' Delete rows of a Pandas DataFrame based on the values in clueless_words. Delete each row that have any empty cell or any cell that contain a clueless_word. If columns argument is specified, only those columns will be concerned. PARAMETERS ---------- (pandas.DataFrame) df: The DataFrame to filter. (list<str>) clueless_words=None: Any cell containing a word of the list is treated as empty. (list<str>) columns=None: Columns considered for the deletion. (bool) inplace=False: If True, the DataFrame will be modified in place, else it is returned. RETURNS ------- (pandas.DataFrame) df=None: Filtered copy of passed df, if inplace==True returns None. ''' if clueless_words is None: clueless_words = [] if columns is None or len(columns) == 0: columns = df.columns # Create a mask that is True for rows that should be retained mask = df[columns].applymap( lambda x: x not in clueless_words and x is not None) # If inplace is True, update the DataFrame in place if inplace: df.drop(df[~mask].index, inplace=True) else: # Otherwise, return a filtered copy of the DataFrame return df[mask]
# @TODO pd.read_table might be the same function (but better). # https://pandas.pydata.org/docs/reference/api/pandas.read_table.html
[docs] def read_df_from_any_avalaible_extensions(sheetpath): ''' Load a Pandas DataFrame from a file. Avalaible extensions: csv, txt, xls, xlsx, feather, parquet, hdf5, sas7bdat, stata, pickle. PARAMETERS ---------- (str) sheetpath: Absolute path of the file to load. RETURNS ------- (pandas.DataFrame) df: The DataFrame loaded from the file. RAISES ------ (ValueError) err: If the file extension is not supported. ''' # Determine the file extension _, file_extension = sheetpath.rsplit('.', 1) # Load the DataFrame using the appropriate reader based on the file extension if file_extension in ['csv', 'txt']: return pd.read_csv(sheetpath) elif file_extension in ['xls', 'xlsx']: return pd.read_excel(sheetpath) elif file_extension in ['feather', 'parquet', 'hdf5']: return pd.read_feather(sheetpath) elif file_extension == 'sas7bdat': return pd.read_sas(sheetpath) elif file_extension == 'stata': return pd.read_stata(sheetpath) elif file_extension == 'pickle': return pd.read_pickle(sheetpath) else: raise ValueError(f'Unsupported file extension: {file_extension}')
[docs] def add_suffix_to_cells_from_a_column(df, suffix, columns, inplace=False): ''' Concatenate a string with the values in certain columns of a Pandas DataFrame. PARAMETERS ---------- (pandas.DataFrame) df: The DataFrame to modify. (str) suffix: The string to concatenate with the values in the specified columns. (list) columns: A list of column names to concatenate with the string. (bool) inplace=False: If True, the DataFrame will be modified in place, else it is returned. RETURNS ------- (pandas.DataFrame) df=None: Modified copy of passed df, if inplace==True returns None. ''' if inplace: for col in columns: df[col] = df[col].apply(lambda x: f'{x}{suffix}') else: df_modified = df.copy() for col in columns: df_modified[col] = df_modified[col].apply(lambda x: f'{x}{suffix}') return df_modified