# Functions prefixed with "tmnt" are treatments and must have those parameters:
# - src: absolute path to the file that will be processed (str)
# - dstdir: absolute path to the directory that should contain new files (str)
# Also, nothing should be returned.
import pandas as pd
# @TODO Maybe update functions using df.isin() for masks.
[docs]
def delete_clueless_rows(df, clueless_words=None, columns=None, inplace=False):
'''
Delete rows of a Pandas DataFrame based on the values in clueless_words.
Delete each row that have any empty cell or any cell that contain a
clueless_word. If columns argument is specified, only those columns
will be concerned.
PARAMETERS
----------
(pandas.DataFrame) df:
The DataFrame to filter.
(list<str>) clueless_words=None:
Any cell containing a word of the list is treated as empty.
(list<str>) columns=None:
Columns considered for the deletion.
(bool) inplace=False:
If True, the DataFrame will be modified in place, else it is returned.
RETURNS
-------
(pandas.DataFrame) df=None:
Filtered copy of passed df, if inplace==True returns None.
'''
if clueless_words is None:
clueless_words = []
if columns is None or len(columns) == 0:
columns = df.columns
# Create a mask that is True for rows that should be retained
mask = df[columns].applymap(
lambda x: x not in clueless_words and x is not None)
# If inplace is True, update the DataFrame in place
if inplace:
df.drop(df[~mask].index, inplace=True)
else: # Otherwise, return a filtered copy of the DataFrame
return df[mask]
# @TODO pd.read_table might be the same function (but better).
# https://pandas.pydata.org/docs/reference/api/pandas.read_table.html
[docs]
def read_df_from_any_avalaible_extensions(sheetpath):
'''
Load a Pandas DataFrame from a file.
Avalaible extensions: csv, txt, xls, xlsx, feather, parquet,
hdf5, sas7bdat, stata, pickle.
PARAMETERS
----------
(str) sheetpath:
Absolute path of the file to load.
RETURNS
-------
(pandas.DataFrame) df:
The DataFrame loaded from the file.
RAISES
------
(ValueError) err:
If the file extension is not supported.
'''
# Determine the file extension
_, file_extension = sheetpath.rsplit('.', 1)
# Load the DataFrame using the appropriate reader based on the file extension
if file_extension in ['csv', 'txt']:
return pd.read_csv(sheetpath)
elif file_extension in ['xls', 'xlsx']:
return pd.read_excel(sheetpath)
elif file_extension in ['feather', 'parquet', 'hdf5']:
return pd.read_feather(sheetpath)
elif file_extension == 'sas7bdat':
return pd.read_sas(sheetpath)
elif file_extension == 'stata':
return pd.read_stata(sheetpath)
elif file_extension == 'pickle':
return pd.read_pickle(sheetpath)
else:
raise ValueError(f'Unsupported file extension: {file_extension}')
[docs]
def add_suffix_to_cells_from_a_column(df, suffix, columns, inplace=False):
'''
Concatenate a string with the values in certain columns of a Pandas DataFrame.
PARAMETERS
----------
(pandas.DataFrame) df:
The DataFrame to modify.
(str) suffix:
The string to concatenate with the values in the specified columns.
(list) columns:
A list of column names to concatenate with the string.
(bool) inplace=False:
If True, the DataFrame will be modified in place, else it is returned.
RETURNS
-------
(pandas.DataFrame) df=None:
Modified copy of passed df, if inplace==True returns None.
'''
if inplace:
for col in columns:
df[col] = df[col].apply(lambda x: f'{x}{suffix}')
else:
df_modified = df.copy()
for col in columns:
df_modified[col] = df_modified[col].apply(lambda x: f'{x}{suffix}')
return df_modified