Skip to content

Commit

Permalink
feat(python): Add an "include_file_paths" parameter to `read_excel` and `read_ods`
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie committed Dec 27, 2024
1 parent 2685a86 commit 9f67f7f
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 16 deletions.
48 changes: 37 additions & 11 deletions py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,8 @@
from polars._typing import ExcelSpreadsheetEngine, FileSource, SchemaDict


def _sources(
source: FileSource,
) -> tuple[Any, bool]:
def _sources(source: FileSource) -> tuple[Any, bool]:
"""Unpack any glob patterns, standardise file paths."""
read_multiple_workbooks = True
sources: list[Any] = []

Expand All @@ -60,8 +59,11 @@ def _sources(
for src in source: # type: ignore[union-attr]
if isinstance(src, (str, os.PathLike)) and not Path(src).exists():
src = os.path.expanduser(str(src)) # noqa: PTH111
sources.extend(glob(src, recursive=True)) # noqa: PTH207
sources.extend(files := glob(src, recursive=True)) # noqa: PTH207
read_multiple_workbooks = bool(files)
else:
if isinstance(src, os.PathLike):
src = str(src)
sources.append(src)

return sources, read_multiple_workbooks
Expand Down Expand Up @@ -110,6 +112,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -129,6 +132,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -148,6 +152,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -169,6 +174,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -188,6 +194,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -207,6 +214,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -227,6 +235,7 @@ def read_excel(
columns: Sequence[int] | Sequence[str] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
include_file_paths: str | None = None,
drop_empty_rows: bool = True,
drop_empty_cols: bool = True,
raise_if_empty: bool = True,
Expand Down Expand Up @@ -299,6 +308,8 @@ def read_excel(
entire dataset is scanned to determine the dtypes, which can slow parsing for
large workbooks. Note that only the "calamine" and "xlsx2csv" engines support
this parameter.
include_file_paths
Include the path of the source file(s) as a column with this name.
drop_empty_rows
Indicate whether to omit empty rows when reading data into the DataFrame.
drop_empty_cols
Expand Down Expand Up @@ -374,12 +385,12 @@ def read_excel(
read_options=read_options,
schema_overrides=schema_overrides,
infer_schema_length=infer_schema_length,
include_file_paths=include_file_paths,
raise_if_empty=raise_if_empty,
has_header=has_header,
columns=columns,
drop_empty_rows=drop_empty_rows,
drop_empty_cols=drop_empty_cols,
read_multiple_workbooks=read_multiple_workbooks,
)
for src in sources
]
Expand All @@ -399,6 +410,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -415,6 +427,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -431,6 +444,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -447,6 +461,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -463,6 +478,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -479,6 +495,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = ...,
schema_overrides: SchemaDict | None = ...,
infer_schema_length: int | None = ...,
include_file_paths: str | None = ...,
drop_empty_rows: bool = ...,
drop_empty_cols: bool = ...,
raise_if_empty: bool = ...,
Expand All @@ -494,6 +511,7 @@ def read_ods(
columns: Sequence[int] | Sequence[str] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
include_file_paths: str | None = None,
drop_empty_rows: bool = True,
drop_empty_cols: bool = True,
raise_if_empty: bool = True,
Expand Down Expand Up @@ -529,6 +547,8 @@ def read_ods(
The maximum number of rows to scan for schema inference. If set to `None`, the
entire dataset is scanned to determine the dtypes, which can slow parsing for
large workbooks.
include_file_paths
Include the path of the source file(s) as a column with this name.
drop_empty_rows
Indicate whether to omit empty rows when reading data into the DataFrame.
drop_empty_cols
Expand Down Expand Up @@ -577,12 +597,12 @@ def read_ods(
read_options=None,
schema_overrides=schema_overrides,
infer_schema_length=infer_schema_length,
include_file_paths=include_file_paths,
raise_if_empty=raise_if_empty,
drop_empty_rows=drop_empty_rows,
drop_empty_cols=drop_empty_cols,
has_header=has_header,
columns=columns,
read_multiple_workbooks=read_multiple_workbooks,
)
for src in sources
]
Expand All @@ -593,7 +613,7 @@ def read_ods(


def _read_spreadsheet(
source: str | Path | IO[bytes] | bytes,
source: str | IO[bytes] | bytes,
*,
sheet_id: int | Sequence[int] | None,
sheet_name: str | Sequence[str] | None,
Expand All @@ -602,14 +622,14 @@ def _read_spreadsheet(
read_options: dict[str, Any] | None = None,
schema_overrides: SchemaDict | None = None,
infer_schema_length: int | None = N_INFER_DEFAULT,
include_file_paths: str | None = None,
columns: Sequence[int] | Sequence[str] | None = None,
has_header: bool = True,
raise_if_empty: bool = True,
drop_empty_rows: bool = True,
drop_empty_cols: bool = True,
read_multiple_workbooks: bool = False,
) -> pl.DataFrame | dict[str, pl.DataFrame]:
if isinstance(source, (str, Path)):
if isinstance(source, str):
source = normalize_filepath(source)
if looks_like_url(source):
source = process_file_url(source)
Expand Down Expand Up @@ -655,6 +675,12 @@ def _read_spreadsheet(
msg = f"no matching sheets found when `sheet_{param}` is {value!r}"
raise ValueError(msg)

if include_file_paths:
workbook = source if isinstance(source, str) else "in-mem"
parsed_sheets = {
name: frame.with_columns(F.lit(workbook).alias(include_file_paths))
for name, frame in parsed_sheets.items()
}
if return_multiple_sheets:
return parsed_sheets
return next(iter(parsed_sheets.values()))
Expand Down Expand Up @@ -762,11 +788,11 @@ def _get_sheet_names(

def _initialise_spreadsheet_parser(
engine: str | None,
source: str | Path | IO[bytes] | bytes,
source: str | IO[bytes] | bytes,
engine_options: dict[str, Any],
) -> tuple[Callable[..., pl.DataFrame], Any, list[dict[str, Any]]]:
"""Instantiate the indicated spreadsheet parser and establish related properties."""
if isinstance(source, (str, Path)) and not Path(source).exists():
if isinstance(source, str) and not Path(source).exists():
raise FileNotFoundError(source)

if engine == "xlsx2csv": # default
Expand Down
19 changes: 14 additions & 5 deletions py-polars/tests/unit/io/test_spreadsheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,14 @@ def test_read_excel_multiple_workbooks(
],
sheet_id=None,
sheet_name="test1",
include_file_paths="path",
**params,
)
expected = pl.DataFrame(
{"hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"]}
{
"hello": ["Row 1", "Row 2", "Row 1", "Row 2", "Row 1", "Row 2"],
"path": [str(spreadsheet_path)] * 6,
},
)
assert_frame_equal(df, expected)

Expand Down Expand Up @@ -833,11 +837,16 @@ def test_excel_write_compound_types(engine: ExcelSpreadsheetEngine) -> None:
df.write_excel(xls, worksheet="data")

# expect string conversion (only scalar values are supported)
xldf = pl.read_excel(xls, sheet_name="data", engine=engine)
xldf = pl.read_excel(
xls,
sheet_name="data",
engine=engine,
include_file_paths="wbook",
)
assert xldf.rows() == [
("[1, 2]", "{'y': 'a', 'z': 9}"),
("[3, 4]", "{'y': 'b', 'z': 8}"),
("[5, 6]", "{'y': 'c', 'z': 7}"),
("[1, 2]", "{'y': 'a', 'z': 9}", "in-mem"),
("[3, 4]", "{'y': 'b', 'z': 8}", "in-mem"),
("[5, 6]", "{'y': 'c', 'z': 7}", "in-mem"),
]


Expand Down

0 comments on commit 9f67f7f

Please sign in to comment.