""" file_io.py: All the file read and write functions """ import csv from pathlib import Path import numpy as np import pandas as pd def read_csv_to_df(fname: str, **kwargs) -> pd.DataFrame: """ Expects sep or delimiter in kwargs. If not included then we fallback on the pandas interpretation Use automatic dialect detection by setting sep to None and engine to python """ if "delimiter" in kwargs and "sep" not in kwargs: kwargs["sep"] = kwargs["delimiter"] kwargs["delimiter"] = None if "sep" in kwargs: return pd.read_csv(fname, **kwargs) # Use automatic dialect detection by setting sep to None and engine to python kwargs["sep"] = None return pd.read_csv(fname, engine="python", **kwargs) def read_ods_format_to_df(fname: str, **kwargs) -> pd.DataFrame: """ Read ODS format to dataframe """ import pyexcel_ods3 data = pyexcel_ods3.get_data(fname, **kwargs) ave_line_length = np.mean([len(line) for line in data]) data_lines = [ line for line in data if len(line) >= ave_line_length ] # assume this is the data header = data_lines[0] data_lines = data_lines[1:] df_dict = {column: [] for column in header} for line in data_lines: for column, pt in zip(df_dict.keys(), line, strict=False): df_dict[column].append(pt) return pd.DataFrame(df_dict) def get_supported_file_types_df(): """ Installed readers """ return [ { "title": "text", "kwargs": { "header": 0, "skipinitialspace": True, "index_col": None, "comment": "#", "quotechar": '"', "quoting": csv.QUOTE_MINIMAL, "engine": "python", "skip_blank_lines": True, }, "extensions": ("csv", "txt"), "writedf": pd.DataFrame.to_csv, "readf": read_csv_to_df, }, { "title": "excel", "kwargs": {"sheet_name": 0, "header": 0, "skiprows": 0}, "extensions": ("xls", "xlsx", "xlsm", "xlsb"), "writedf": pd.DataFrame.to_excel, "readf": pd.read_excel, }, { "title": "ods", "kwargs": {"sheet_name": 0, "header": 0, "skiprows": 0}, "extensions": ("ods", "odt", "odf"), "writedf": None, "readf": read_ods_format_to_df, }, ] def get_supported_file_formats() -> tuple[str]: """ returns collection of all the supported extensions """ extensions = [] for entry in get_supported_file_types_df(): extensions.extend(entry["extensions"]) return tuple(extensions) def read_file_to_df(fname: str, **kwargs) -> pd.DataFrame: """ Cycle through extensions, use the reader object to call """ ext = Path(fname).suffix.strip(".").lower() df = None found = False for reader in get_supported_file_types_df(): if ext in reader["extensions"]: found = True if kwargs is None: kwargs = reader["kwargs"] df = reader["readf"](fname, **kwargs) break if not found: msg = f"Extension {ext} unsupported" raise UserWarning(msg) return pd.DataFrame(df) def write(df: pd.DataFrame, fname: str, **kwargs) -> None: """ Search for the correct exporter and write the dataframe using that writer. """ ext = Path(fname).suffix.strip(".").lower() types = get_supported_file_types_df() writer = None for value in types: if ext in value["extensions"]: writer = value["writedf"] break assert writer writer(df, fname, **kwargs)