Source code for ankipandas.util.dataframe
""" DataFrame utilities. """
# std
from __future__ import annotations
# 3rd
import pandas as pd
# ours
from ankipandas.util.log import log
def _sync_metadata(df_ret: pd.DataFrame, df_old: pd.DataFrame) -> None:
"""
If the df_old has a `_metadata` field, containing a list of attribute
names that contain metadata, then this is copied from `df_old` to the new
dataframe `df_ret.
Args:
df_ret:
df_old:
Returns:
None
"""
if hasattr(df_old, "_metadata"):
for key in df_old._metadata:
value = getattr(df_old, key)
log.debug("Setting metadata attribute %s to %s", key, value)
setattr(df_ret, key, value)
[docs]
def replace_df_inplace(df: pd.DataFrame, df_new: pd.DataFrame) -> None:
"""Replace dataframe 'in place'.
If the dataframe has a `_metadata` field, containing a list of attribute
names that contain metadata, then this is copied from `df` to the new
dataframe.
Args:
df: :class:`pandas.DataFrame` to be replaced
df_new: :class:`pandas.DataFrame` to replace the previous one
Returns:
None
"""
# Drop all ROWs (not columns)
if df.index.any():
df.drop(df.index, inplace=True)
for col in df_new.columns:
df[col] = df_new[col]
drop_cols = set(df.columns) - set(df_new.columns)
if drop_cols:
df.drop(drop_cols, axis=1, inplace=True)
_sync_metadata(df_new, df)
# todo: this might be made more elegant in the future for sure...
# fixme: This removes items whenever it can't merge!
[docs]
def merge_dfs(
df: pd.DataFrame,
df_add: pd.DataFrame,
id_df: str,
inplace=False,
id_add="id",
prepend="",
replace=False,
prepend_clash_only=True,
columns=None,
drop_columns=None,
) -> pd.DataFrame | None:
"""
Merge information from two dataframes.
If the dataframe has a `_metadata` field, containing a list of attribute
names that contain metadata, then this is copied from `df` to the new
dataframe.
Args:
df: Original :class:`pandas.DataFrame`
df_add: :class:`pandas.DataFrame` to be merged with original
:class:`pandas.DataFrame`
id_df: Column of original dataframe that contains the id along which
we merge.
inplace: If False, return new dataframe, else update old one
id_add: Column of the new dataframe that contains the id along which
we merge
prepend: Prepend a string to the column names from the new dataframe
replace: Replace columns
prepend_clash_only: Only prepend string to the column names from the
new dataframe if there is a name clash.
columns: Keep only these columns
drop_columns: Drop these columns
Returns:
New merged :class:`pandas.DataFrame`
"""
# Careful: Do not drop the id column until later (else we can't merge)
# Still, we want to remove as much as possible here, because it's probably
# better performing
if columns:
df_add = df_add.drop(
set(df_add.columns) - (set(columns) | {id_add}), axis=1
)
if drop_columns:
df_add = df_add.drop(set(drop_columns) - {id_add}, axis=1)
# Careful: Rename columns after dropping unwanted ones
if prepend_clash_only:
col_clash = set(df.columns) & set(df_add.columns)
rename_dict = {col: prepend + col for col in col_clash}
else:
rename_dict = {col: prepend + col for col in df_add.columns}
df_add = df_add.rename(columns=rename_dict)
# Careful: Might have renamed id_add as well
if id_add in rename_dict:
id_add = rename_dict[id_add]
if replace:
# Simply remove all potential clashes
replaced_columns = set(df_add.columns) & set(df.columns)
df = df.drop(replaced_columns, axis=1)
merge_kwargs = {}
if id_add in df_add.columns:
merge_kwargs["right_on"] = id_add
elif id_add == df_add.index.name:
merge_kwargs["right_index"] = True
else:
raise ValueError(f"'{id_add}' is neither index nor column.")
if id_df in df.columns:
merge_kwargs["left_on"] = id_df
elif id_df == df.index.name:
merge_kwargs["left_index"] = True
else:
raise ValueError(f"'{id_df}' is neither index nor column.")
df_merge = df.merge(df_add, **merge_kwargs)
# Now remove id_add if it was to be removed
# Careful: 'in' doesn't work with None
if (columns and id_add not in columns) or (
drop_columns and id_add in drop_columns
):
df_merge.drop(id_add, axis=1, inplace=True)
# Make sure we don't have two ID columns
new_id_add_col = id_add
if id_add in rename_dict:
new_id_add_col = rename_dict[id_add]
if new_id_add_col in df_merge.columns and id_df != new_id_add_col:
df_merge.drop(new_id_add_col, axis=1, inplace=True)
_sync_metadata(df_merge, df)
if inplace:
replace_df_inplace(df, df_merge)
return None # mypy
else:
return df_merge