Skip to content

Commit

Permalink
Improve Dataset display (#391)
Browse files Browse the repository at this point in the history
* Fix minor bugs

* Update dataframe display
  • Loading branch information
goodwanghan authored Nov 18, 2022
1 parent d35adcd commit eb5b7cc
Show file tree
Hide file tree
Showing 9 changed files with 223 additions and 135 deletions.
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,13 @@ jupyter:
jupyter nbextension enable fugue_notebook --py
jupyter notebook --port=8888 --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*'

# lab: create ./tmp, install this package from the working tree plus
# fugue-jupyter, run `fugue-jupyter install startup`, then start JupyterLab on
# port 8888 bound to 0.0.0.0 without opening a browser.
# NOTE(review): the token and password are set to empty strings and every
# origin is allowed -- presumably meant for local/dev containers only; confirm
# before exposing this port on a shared network.
lab:
mkdir -p tmp
pip install .
pip install fugue-jupyter
fugue-jupyter install startup
jupyter lab --port=8888 --ip=0.0.0.0 --no-browser --allow-root --NotebookApp.token='' --NotebookApp.password='' --NotebookApp.allow_origin='*'

test:
python3 -b -m pytest --reruns 2 --only-rerun 'Overflow in cast' --only-rerun 'Table or view not found' tests/

Expand Down
11 changes: 8 additions & 3 deletions fugue/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,24 @@
from triad.collections.fs import FileSystem

from fugue.bag.array_bag import ArrayBag
from fugue.bag.bag import Bag
from fugue.bag.bag import Bag, BagDisplay
from fugue.collections.partition import PartitionCursor, PartitionSpec
from fugue.collections.yielded import Yielded, YieldedFile
from fugue.constants import register_global_conf
from fugue.dataframe.array_dataframe import ArrayDataFrame
from fugue.dataframe.arrow_dataframe import ArrowDataFrame
from fugue.dataframe.dataframe import DataFrame, LocalBoundedDataFrame, LocalDataFrame
from fugue.dataframe.dataframe import (
DataFrame,
DataFrameDisplay,
LocalBoundedDataFrame,
LocalDataFrame,
)
from fugue.dataframe.dataframe_iterable_dataframe import LocalDataFrameIterableDataFrame
from fugue.dataframe.dataframes import DataFrames
from fugue.dataframe.iterable_dataframe import IterableDataFrame
from fugue.dataframe.pandas_dataframe import PandasDataFrame
from fugue.dataframe.utils import to_local_bounded_df, to_local_df
from fugue.dataset import Dataset, display_dataset
from fugue.dataset import Dataset, DatasetDisplay, get_dataset_display
from fugue.execution.execution_engine import ExecutionEngine, MapEngine, SQLEngine
from fugue.execution.factory import (
infer_execution_engine,
Expand Down
71 changes: 38 additions & 33 deletions fugue/bag/bag.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
from abc import abstractmethod
from typing import Any, List, Optional

from triad import SerializableRLock

from ..dataset import Dataset, display_dataset
from ..dataset import Dataset, DatasetDisplay, get_dataset_display


class Bag(Dataset):
Expand Down Expand Up @@ -66,33 +64,40 @@ def is_bounded(self) -> bool:
return True


_SHOW_LOCK = SerializableRLock()


@display_dataset.candidate(
lambda ds, *args, **kwargs: isinstance(ds, Bag), priority=0.1
)
def _display_bag(
ds: Bag, n: int = 10, with_count: bool = False, title: Optional[str] = None
):
head_rows = ds.head(n).as_array()
if len(head_rows) < n:
count = len(head_rows)
else:
count = ds.count() if with_count else -1
with _SHOW_LOCK:
if title is not None and title != "":
print(title)
print(type(ds).__name__)
print(head_rows)
if count >= 0:
print(f"Total count: {count}")
print("")
if ds.has_metadata:
print("Metadata:")
try:
# try pretty print, but if not convertible to json, print original
print(ds.metadata.to_json(indent=True))
except Exception: # pragma: no cover
print(ds.metadata)
print("")
class BagDisplay(DatasetDisplay):
    """Plain-text display handler for a :class:`~.Bag`"""

    @property
    def bg(self) -> Bag:
        """The :class:`~.Bag` being displayed"""
        return self._ds  # type: ignore

    def show(
        self, n: int = 10, with_count: bool = False, title: Optional[str] = None
    ) -> None:
        """Print the head of the bag, optionally its count, and its metadata

        :param n: top n items to display, defaults to 10
        :param with_count: whether to display the total count, defaults to False
        :param title: title to display, defaults to None
        """
        bag = self.bg
        rows = bag.head(n).as_array()
        if len(rows) < n:
            # the head came back short, so its length is used as the count
            total = len(rows)
        elif with_count:
            total = bag.count()
        else:
            # negative means "count not requested"; nothing is printed for it
            total = -1
        # hold the shared show lock for the whole printout
        with DatasetDisplay._SHOW_LOCK:
            if title is not None and title != "":
                print(title)
            print(type(bag).__name__)
            print(rows)
            if total >= 0:
                print(f"Total count: {total}")
                print("")
            if bag.has_metadata:
                print("Metadata:")
                try:
                    # try pretty print, but if not convertible to json, print original
                    print(bag.metadata.to_json(indent=True))
                except Exception:  # pragma: no cover
                    print(bag.metadata)
                print("")


@get_dataset_display.candidate(lambda ds: isinstance(ds, Bag), priority=0.1)
def _get_bag_display(ds: Bag):
    """Candidate for ``get_dataset_display`` that handles :class:`~.Bag` inputs

    :param ds: the Bag to be displayed
    :return: a :class:`BagDisplay` wrapping ``ds``
    """
    display = BagDisplay(ds)
    return display
73 changes: 40 additions & 33 deletions fugue/dataframe/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

from .._utils.display import PrettyTable
from ..collections.yielded import Yielded
from ..dataset import Dataset, display_dataset
from ..dataset import Dataset, DatasetDisplay, get_dataset_display
from ..exceptions import FugueDataFrameOperationError


Expand Down Expand Up @@ -374,38 +374,45 @@ def result(self) -> DataFrame:
return self._df


_SHOW_LOCK = SerializableRLock()


@display_dataset.candidate(
lambda ds, *args, **kwargs: isinstance(ds, DataFrame), priority=0.1
)
def _display_dataframe(
ds: DataFrame, n: int = 10, with_count: bool = False, title: Optional[str] = None
):
best_width = 100
head_rows = ds.head(n).as_array(type_safe=True)
if len(head_rows) < n:
count = len(head_rows)
else:
count = ds.count() if with_count else -1
with _SHOW_LOCK:
if title is not None and title != "":
print(title)
print(type(ds).__name__)
tb = PrettyTable(ds.schema, head_rows, best_width)
print("\n".join(tb.to_string()))
if count >= 0:
print(f"Total count: {count}")
print("")
if ds.has_metadata:
print("Metadata:")
try:
# try pretty print, but if not convertible to json, print original
print(ds.metadata.to_json(indent=True))
except Exception: # pragma: no cover
print(ds.metadata)
print("")
class DataFrameDisplay(DatasetDisplay):
    """Plain-text display handler for a :class:`~.DataFrame`"""

    @property
    def df(self) -> DataFrame:
        """The :class:`~.DataFrame` being displayed"""
        return self._ds  # type: ignore

    def show(
        self, n: int = 10, with_count: bool = False, title: Optional[str] = None
    ) -> None:
        """Print the head of the dataframe as a table, optionally its count,
        and its metadata

        :param n: top n items to display, defaults to 10
        :param with_count: whether to display the total count, defaults to False
        :param title: title to display, defaults to None
        """
        table_width = 100
        frame = self.df
        rows = frame.head(n).as_array(type_safe=True)
        if len(rows) < n:
            # the head came back short, so its length is used as the count
            total = len(rows)
        elif with_count:
            total = frame.count()
        else:
            # negative means "count not requested"; nothing is printed for it
            total = -1
        # hold the shared show lock for the whole printout
        with DatasetDisplay._SHOW_LOCK:
            if title is not None and title != "":
                print(title)
            print(type(frame).__name__)
            table = PrettyTable(frame.schema, rows, table_width)
            rendered = "\n".join(table.to_string())
            print(rendered)
            if total >= 0:
                print(f"Total count: {total}")
                print("")
            if frame.has_metadata:
                print("Metadata:")
                try:
                    # try pretty print, but if not convertible to json, print original
                    print(frame.metadata.to_json(indent=True))
                except Exception:  # pragma: no cover
                    print(frame.metadata)
                print("")


@get_dataset_display.candidate(lambda ds: isinstance(ds, DataFrame), priority=0.1)
def _get_dataframe_display(ds: DataFrame):
    """Candidate for ``get_dataset_display`` that handles :class:`~.DataFrame`
    inputs

    :param ds: the DataFrame to be displayed
    :return: a :class:`DataFrameDisplay` wrapping ``ds``
    """
    display = DataFrameDisplay(ds)
    return display


def _get_schema_change(
Expand Down
98 changes: 65 additions & 33 deletions fugue/dataset.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,11 @@
import html
from abc import ABC, abstractmethod
from typing import Any, Optional
from triad import ParamDict, assert_or_throw
from .exceptions import FugueDatasetEmptyError
from ._utils.registry import fugue_plugin


@fugue_plugin
def display_dataset(
ds: "Dataset", n: int = 10, with_count: bool = False, title: Optional[str] = None
) -> None: # pragma: no cover
"""General function to display a :class:`~.Dataset`
.. admonition:: Example: how to register a custom display
.. code-block:: python

from fugue import display_dataset, DataFrame
from triad import ParamDict, SerializableRLock, assert_or_throw

# higher priority will overwrite the existing display functions
@display_dataset.candidate(
lambda ds, *args, **kwargs: isinstance(ds, DataFrame), priority=1.0)
def my_dataframe_display(ds, n=10, with_count=False, title=None):
print(type(ds))
:param ds: the Dataset to be displayed
:param n: top n items to display, defaults to 10
:param with_count: whether to display the total count, defaults to False
:param title: title to display, defaults to None
"""

raise NotImplementedError(f"No matching display function registered for {type(ds)}")
from ._utils.registry import fugue_plugin
from .exceptions import FugueDatasetEmptyError


class Dataset(ABC):
Expand Down Expand Up @@ -91,6 +67,13 @@ def count(self) -> int: # pragma: no cover
"""Get number of rows of this dataframe"""
raise NotImplementedError

def assert_not_empty(self) -> None:
    """Check that this dataset contains at least one item

    :raises FugueDatasetEmptyError: if it is empty
    """
    error = FugueDatasetEmptyError("dataframe is empty")
    is_populated = not self.empty
    assert_or_throw(is_populated, error)

def show(
self, n: int = 10, with_count: bool = False, title: Optional[str] = None
) -> None:
Expand All @@ -107,11 +90,60 @@ def show(
need to :func:`fugue.execution.execution_engine.ExecutionEngine.persist`
the dataset.
"""
return display_dataset(self, n=n, with_count=with_count, title=title)
return get_dataset_display(self).show(n=n, with_count=with_count, title=title)

def assert_not_empty(self) -> None:
"""Assert this dataframe is not empty
def __repr__(self):
    """Text representation, delegated to the display handler that
    ``get_dataset_display`` returns for this Dataset
    """
    display = get_dataset_display(self)
    return display.repr()

:raises FugueDatasetEmptyError: if it is empty
def _repr_html_(self):
    """HTML representation, delegated to the display handler that
    ``get_dataset_display`` returns for this Dataset
    """
    display = get_dataset_display(self)
    return display.repr_html()


class DatasetDisplay(ABC):
    """The base class for display handlers of :class:`~.Dataset`

    Subclasses must implement :meth:`show`. :meth:`repr` and
    :meth:`repr_html` have default implementations built from the class name
    of the wrapped dataset.

    :param ds: the Dataset
    """

    # One lock object defined on the base class, reachable from any handler
    # as ``DatasetDisplay._SHOW_LOCK``.
    _SHOW_LOCK = SerializableRLock()

    def __init__(self, ds: Dataset):
        self._ds = ds

    @abstractmethod
    def show(
        self, n: int = 10, with_count: bool = False, title: Optional[str] = None
    ) -> None:  # pragma: no cover
        """Show the :class:`~.Dataset`

        :param n: top n items to display, defaults to 10
        :param with_count: whether to display the total count, defaults to False
        :param title: title to display, defaults to None
        :raises NotImplementedError: always, subclasses provide the behavior
        """
        # No emptiness assertion here: this class has no ``empty`` attribute,
        # so such a check would fail with AttributeError before reaching the
        # intended NotImplementedError.
        raise NotImplementedError

    def repr(self) -> str:
        """The string representation of the :class:`~.Dataset`

        :return: the class name of the wrapped dataset
        """
        return type(self._ds).__name__

    def repr_html(self) -> str:
        """The HTML representation of the :class:`~.Dataset`

        :return: the result of :meth:`repr`, HTML-escaped
        """
        return html.escape(self.repr())


@fugue_plugin
def get_dataset_display(ds: "Dataset") -> DatasetDisplay:  # pragma: no cover
    """Return the :class:`DatasetDisplay` used to display a :class:`~.Dataset`

    This body is the fallback: it only raises. Concrete handlers are attached
    elsewhere through ``get_dataset_display.candidate``.

    :param ds: the Dataset to be displayed
    :raises NotImplementedError: when this fallback body is reached
    """
    ds_type = type(ds)
    message = f"No matching DatasetDisplay registered for {ds_type}"
    raise NotImplementedError(message)
17 changes: 6 additions & 11 deletions fugue/extensions/_builtins/outputters.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,18 @@
from typing import List, no_type_check

from triad import ParamDict, Schema, SerializableRLock, assert_or_throw
from triad.utils.convert import to_type

from fugue.collections.partition import PartitionCursor
from fugue.dataframe import DataFrame, DataFrames, LocalDataFrame
from fugue.dataframe.array_dataframe import ArrayDataFrame
from fugue.dataframe.utils import _df_eq, to_local_bounded_df
from fugue.dataset import display_dataset
from fugue.exceptions import FugueWorkflowError
from fugue.execution.execution_engine import _generate_comap_empty_dfs
from fugue.extensions.outputter import Outputter
from fugue.extensions.transformer.convert import _to_output_transformer
from fugue.extensions.transformer.transformer import CoTransformer, Transformer
from fugue.rpc import EmptyRPCHandler, to_rpc_handler
from triad import SerializableRLock
from triad.collections.dict import ParamDict
from triad.collections.schema import Schema
from triad.utils.assertion import assert_or_throw
from triad.utils.convert import to_type


class Show(Outputter):
Expand All @@ -27,12 +24,10 @@ def process(self, dfs: DataFrames) -> None:
n = self.params.get("n", 10)
with_count = self.params.get("with_count", False)
with Show.LOCK:
n = 0
m = 0
for df in dfs.values():
display_dataset(
df, n=n, with_count=with_count, title=title if n == 0 else None
)
n += 1
df.show(n=n, with_count=with_count, title=title if m == 0 else None)
m += 1


class AssertEqual(Outputter):
Expand Down
Loading

0 comments on commit eb5b7cc

Please sign in to comment.