-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add DatasetDict * remove python bounds * remove huggingface channel * pin pyarrow to 6.0.0 * use == instead of = in CondaPkg * relax numpy * relax pillow
- Loading branch information
1 parent
c59197a
commit f74d35a
Showing 8 changed files with 146 additions and 15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
channels = ["conda-forge", "huggingface"] | ||
channels = ["conda-forge"] | ||
|
||
[deps] | ||
datasets = ">=2.7, <3" | ||
numpy = ">=1.23, <2" | ||
pillow = ">=9.2, <10" | ||
python = ">=3.6, <4" | ||
numpy = ">=1.20, <2" | ||
pillow = ">=9.1, <10" | ||
pyarrow = "==6.0.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
""" | ||
DatasetDict(pydatasetdict::Py; transform = py2jl) | ||
A `DatasetDict` is a dictionary of `Dataset`s. It is a wrapper around a `datasets.DatasetDict` object. | ||
The `transform` is applied to each [`Dataset`](@ref). | ||
The [`py2jl`](@ref) default converts python types to julia types. | ||
See also [`load_dataset`](@ref) and [`Dataset`](@ref). | ||
""" | ||
mutable struct DatasetDict
    # Wrapped Python `datasets.DatasetDict` object.
    pyd::Py
    # Callable passed on to every `Dataset` created from this dictionary.
    # Left untyped (and the struct mutable) because `set_transform!` reassigns it.
    transform

    # Validate with an explicit check instead of `@assert`: assertions are a
    # debugging aid and may be disabled, which would let an arbitrary Python
    # object be wrapped silently.
    function DatasetDict(pydatasetdict::Py; transform = py2jl)
        if !pyisinstance(pydatasetdict, datasets.DatasetDict)
            throw(ArgumentError(
                "expected a Python `datasets.DatasetDict` object, got a different Python type",
            ))
        end
        return new(pydatasetdict, transform)
    end
end
|
||
# Property access for `DatasetDict`.
#
# Struct fields are returned directly. Any other name is looked up on the wrapped
# Python object; a resulting `datasets.Dataset` or `datasets.DatasetDict` is
# re-wrapped with this dictionary's transform, and everything else is passed
# through `py2jl`.
function Base.getproperty(d::DatasetDict, s::Symbol)
    # Real fields of the struct are served without touching Python.
    hasfield(DatasetDict, s) && return getfield(d, s)

    attr = getproperty(getfield(d, :pyd), s)
    tr = getfield(d, :transform)

    pyisinstance(attr, datasets.Dataset) && return Dataset(attr; transform = tr)
    pyisinstance(attr, datasets.DatasetDict) && return DatasetDict(attr; transform = tr)
    return py2jl(attr)
end
|
||
# Length of a `DatasetDict` is the length reported by the wrapped Python object.
function Base.length(d::DatasetDict)
    return length(getfield(d, :pyd))
end
|
||
# Look up the entry stored under the string key `i` in the wrapped Python object
# and wrap it as a `Dataset` that carries this dictionary's transform.
function Base.getindex(d::DatasetDict, i::AbstractString)
    return Dataset(getfield(d, :pyd)[i]; transform = getfield(d, :transform))
end
|
||
# Replace the transform stored in `d`.
#
# Passing `nothing` stores `identity` instead. The value that ends up stored is
# returned.
function set_transform!(d::DatasetDict, transform)
    chosen = transform === nothing ? identity : transform
    d.transform = chosen
    return chosen
end
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
|
||
# Integration tests for `DatasetDict`, driven by the "mnist" dataset returned by
# `load_dataset`. The nested testsets share the single `dd` object, so their
# order matters: transforms are swapped, restored, and finally a Python-side
# transform is installed.
@testset "MNIST" begin | ||
dd = load_dataset("mnist") | ||
|
||
# `load_dataset` is expected to hand back the Julia wrapper. The two entries
# are the "test" and "train" keys checked via `num_rows` at the end of this file.
@testset "load_dataset" begin | ||
@test dd isa DatasetDict | ||
@test length(dd) == 2 | ||
end | ||
|
||
# With `identity` as the transform, indexing should yield raw Python objects.
@testset "indexing with no transform" begin | ||
# Remember the current transform so it can be restored at the end of this testset.
tr = dd.transform | ||
set_transform!(dd, identity) | ||
|
||
# Integer indexing of the dictionary is expected to have no method; string keys
# are expected to return a `Dataset`.
@test_throws MethodError dd[1] | ||
@test dd["test"] isa Dataset | ||
d = dd["test"] | ||
# A single item is expected to be a Python dict whose values are still `Py` objects.
@test pyisinstance(d[1], pytype(pydict())) | ||
@test d[1]["image"] isa Py | ||
@test d[1]["label"] isa Py | ||
@test pyisinstance(d[1]["label"], pytype(pyint())) | ||
@test py2jl(d[1]["label"]) == 7 | ||
@test py2jl(d[2]["label"]) == 2 | ||
|
||
# A range index is expected to return a Python object whose columns are Python lists.
@test d[1:2] isa Py | ||
@test d[1:2]["image"] isa Py | ||
@test pyisinstance(d[1:2]["image"], pytype(pylist())) | ||
@test d[1:2]["label"] isa Py | ||
@test pyisinstance(d[1:2]["label"], pytype(pylist())) | ||
|
||
# Restore the transform saved above so later testsets see the original one.
set_transform!(dd, tr) | ||
end | ||
|
||
# With the `py2jl` transform in place, items are expected to arrive as native Julia values.
@testset "indexing - py2jl" begin | ||
@test dd.transform === py2jl | ||
d = dd["test"] | ||
sample = d[1] | ||
@test sample isa Dict | ||
@test sample["label"] isa Int | ||
@test sample["label"] == 7 | ||
@test sample["image"] isa Matrix{UInt8} | ||
@test size(sample["image"]) == (28, 28) | ||
|
||
# A range index is expected to give a Dict of per-column vectors.
sample = d[1:2] | ||
@test sample isa Dict | ||
@test sample["image"] isa Vector{Matrix{UInt8}} | ||
@test size(sample["image"]) == (2,) | ||
@test sample["label"] isa Vector{Int} | ||
@test size(sample["label"]) == (2,) | ||
end | ||
|
||
# Installs a transform written in Python that negates every label.
# NOTE(review): `dd.set_transform` is not a struct field, so it presumably resolves
# to the wrapped Python object's method rather than the Julia `set_transform!` — confirm.
# Nothing in this file undoes this transform afterwards.
@testset "python transforms" begin | ||
@pyexec """ | ||
def pytr(x): | ||
return {"label": [-l for l in x["label"]]} | ||
""" => pytr | ||
dd.set_transform(pytr) | ||
@test dd["test"][1]["label"] == -7 | ||
end | ||
|
||
# Non-field properties are expected to come back converted to Julia types.
@testset "getproperty returns julia types" begin | ||
@test dd.num_rows isa Dict{String, Int} | ||
@test dd.num_rows == Dict("test" => 10000, "train" => 60000) | ||
end | ||
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
f74d35a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@JuliaRegistrator register
f74d35a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Registration pull request created: JuliaRegistries/General/74572
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via: