diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..700707c --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,7 @@ +# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" # Location of package manifests + schedule: + interval: "weekly" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3a1f8d9..fa31dac 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -19,7 +19,7 @@ jobs: matrix: version: - '1.9' - - '1' # add back when 1.10 is out + - '1' - 'nightly' os: - ubuntu-latest @@ -44,18 +44,19 @@ jobs: name: Documentation runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: actions/checkout@v4.2.1 + - uses: julia-actions/setup-julia@v2 with: version: '1' - - uses: julia-actions/julia-buildpkg@v1 - - uses: julia-actions/julia-docdeploy@v1 + - run: | + julia --project=docs -e ' + using Pkg + Pkg.develop(PackageSpec(path=pwd())) + Pkg.instantiate()' + - run: julia --color=yes --project=docs docs/make.jl env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} - - run: | - julia --project=docs -e ' - using Documenter: DocMeta, doctest - using HuggingFaceDatasets - DocMeta.setdocmeta!(HuggingFaceDatasets, :DocTestSetup, :(using HuggingFaceDatasets); recursive=true) - doctest(HuggingFaceDatasets)' + JULIA_CONDAPKG_OPENSSL_VERSION: "ignore" + + diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index 3dfba52..d1891a4 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -15,7 +15,7 @@ jobs: run: which julia continue-on-error: true - name: Install Julia, but only if it is not already available in the PATH - uses: julia-actions/setup-julia@v1 + uses: julia-actions/setup-julia@v2 with: version: '1' arch: 
${{ runner.arch }} @@ -41,5 +41,10 @@ jobs: shell: julia --color=yes {0} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # This repo uses Documenter, so we can reuse our [Documenter SSH key](https://documenter.juliadocs.org/stable/man/hosting/walkthrough/). + # If we didn't have one of those setup, we could configure a dedicated ssh deploy key `COMPATHELPER_PRIV` following https://juliaregistries.github.io/CompatHelper.jl/dev/#Creating-SSH-Key. + # Either way, we need an SSH key if we want the PRs that CompatHelper creates to be able to trigger CI workflows themselves. + # That is because GITHUB_TOKEN's can't trigger other workflows (see https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#using-the-github_token-in-a-workflow). + # Check if you have a deploy key setup using these docs: https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/reviewing-your-deploy-keys. COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }} # COMPATHELPER_PRIV: ${{ secrets.COMPATHELPER_PRIV }} \ No newline at end of file diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index 90dc100..4bad0ec 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -6,7 +6,7 @@ on: workflow_dispatch: inputs: lookback: - default: 3 + default: "3" permissions: actions: read checks: read diff --git a/CondaPkg.toml b/CondaPkg.toml index 71f629c..89845ec 100644 --- a/CondaPkg.toml +++ b/CondaPkg.toml @@ -1,10 +1,5 @@ -channels = ["conda-forge"] - [deps] -# h5py = "" -# pillow = ">=9.1, <10" -# pyarrow = "==6.0.0" -datasets = ">=2.12, <3" -numpy = ">=1.20, <2" +datasets = ">=3.0, <4" +numpy = ">=2.0, <3" pillow = "" diff --git a/Project.toml b/Project.toml index 8a4c0d9..e333ec3 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,7 @@ PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [compat] CondaPkg = "0.2" -DLPack = "0.1" +DLPack = "0.3" ImageCore = "0.9, 0.10" MLUtils = 
"0.4.1" PythonCall = "0.9" diff --git a/README.md b/README.md index 418ab2d..08d5d53 100644 --- a/README.md +++ b/README.md @@ -23,31 +23,33 @@ HuggingFaceDatasets.jl provides wrappers around types from the `datasets` python Check out the [examples/](https://github.com/JuliaGenAI/HuggingFaceDatasets.jl/tree/main/examples) folder for usage examples. ```julia +julia> using HuggingFaceDatasets + julia> train_data = load_dataset("mnist", split = "train") Dataset({ features: ['image', 'label'], num_rows: 60000 }) -# Indexing starts with 1. -# Python types are returned by default. julia> train_data[1] -Python: {'image': , 'label': 5} +Python: {'image': , 'label': 5} julia> length(train_data) 60000 -# Now we set the julia format julia> train_data = load_dataset("mnist", split = "train").with_format("julia"); -# Returned observations are now julia objects -julia> train_data[1] +julia> train_data[1] # Returned observations are now julia objects Dict{String, Any} with 2 entries: "label" => 5 - "image" => Gray{N0f8}[Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gray{N0f8}(0.0); Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gray{N0f8}(0.0); … ; Gray{N0f8}(0.0) Gray{N0f8}(0.0) …… + "image" => Gray{N0f8}[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0] julia> train_data[1:2] Dict{String, Vector} with 2 entries: "label" => [5, 0] - "image" => ReinterpretArray{Gray{N0f8}, 2, UInt8, Matrix{UInt8}, false}[[Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gray{N0f8}(0.0); Gray{N0f8}(0.0) Gray{N0f8}(0.0) … Gray{N0f8}(0.0) Gra… + "image" => ReinterpretArray{Gray{N0f8}, 2, UInt8, Matrix{UInt8}, false}[[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0… ``` + +## Troubleshooting + +- If you have problems resolving the CondaPkg environment, try setting `ENV["JULIA_CONDAPKG_OPENSSL_VERSION"] = "ignore"` before loading the package. 
See more details [here](https://github.com/JuliaPy/CondaPkg.jl?tab=readme-ov-file#preferences) diff --git a/docs/Manifest.toml b/docs/Manifest.toml deleted file mode 100644 index 69f758e..0000000 --- a/docs/Manifest.toml +++ /dev/null @@ -1,100 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.7.2" -manifest_format = "2.0" - -[[deps.ANSIColoredPrinters]] -git-tree-sha1 = "574baf8110975760d391c710b6341da1afa48d8c" -uuid = "a4c015fc-c6ff-483c-b24f-f7ea428134e9" -version = "0.0.1" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.DocStringExtensions]] -deps = ["LibGit2"] -git-tree-sha1 = "b19534d1895d702889b219c382a6e18010797f0b" -uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" -version = "0.8.6" - -[[deps.Documenter]] -deps = ["ANSIColoredPrinters", "Base64", "Dates", "DocStringExtensions", "IOCapture", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "122d031e8dcb2d3e767ed434bc4d1ae1788b5a7f" -uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.27.17" - -[[deps.HuggingFaceDatasets]] -path = ".." 
-uuid = "d94b9a45-fdf5-4270-b024-5cbb9ef7117d" -version = "0.1.0" - -[[deps.IOCapture]] -deps = ["Logging", "Random"] -git-tree-sha1 = "f7be53659ab06ddc986428d3a9dcc95f6fa6705a" -uuid = "b5f81e59-6552-4d32-b1f0-c071b021bf89" -version = "0.2.2" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.JSON]] -deps = ["Dates", "Mmap", "Parsers", "Unicode"] -git-tree-sha1 = "3c837543ddb02250ef42f4738347454f95079d4e" -uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" -version = "0.21.3" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.Mmap]] -uuid = "a63ad114-7e13-5084-954f-fe012c677804" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" - -[[deps.Parsers]] -deps = ["Dates"] -git-tree-sha1 = "1285416549ccfcdf0c50d4997a94331e88d68413" -uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "2.3.1" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.Test]] -deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] -uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" diff --git a/docs/make.jl b/docs/make.jl index 691d014..955db59 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -15,6 +15,7 @@ makedocs(; ), pages=[ "Home" => "index.md", + 
"API" => "api.md", ], ) diff --git a/docs/src/api.md b/docs/src/api.md index a430523..eccdb4f 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -1,12 +1,9 @@ -# API - -## Index - -```@index -Pages = ["api.md"] +```@meta +CurrentModule = HuggingFaceDatasets +CollapsedDocStrings = true ``` -## Docs +# API ```@autodocs Modules = [HuggingFaceDatasets] diff --git a/docs/src/index.md b/docs/src/index.md index 4e18e8f..eecfbe6 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -26,26 +26,30 @@ HuggingFaceDatasets.jl provides wrappers around types from the `datasets` python Check out the `examples/` folder for usage examples. ```julia +# Indexing starts with 1. By default, Python types are returned. +julia> using HuggingFaceDatasets + julia> train_data = load_dataset("mnist", split = "train") -Dataset(, identity) +}) -# Indexing starts with 1. -# By defaul, python types are returned. julia> train_data[1] -Python dict: {'image': , 'label': 5} +Python: {'image': , 'label': 5} -julia> set_format!(train_data, "julia") -Dataset(, HuggingFaceDatasets.py2jl) +julia> length(train_data) +60000 -# Now we have julia types -julia> train_data[1] +julia> train_data = load_dataset("mnist", split = "train").with_format("julia"); + +julia> train_data[1] # Returned observations are now julia objects Dict{String, Any} with 2 entries: "label" => 5 - "image" => UInt8[0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00; … ; 0x00 0x00 … 0x00 0x00; 0x00 0x00 … 0x00 0x00] + "image" => Gray{N0f8}[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0] + +julia> train_data[1:2] +Dict{String, Vector} with 2 entries: + "label" => [5, 0] + "image" => ReinterpretArray{Gray{N0f8}, 2, UInt8, Matrix{UInt8}, false}[[0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 0… ``` diff --git a/src/HuggingFaceDatasets.jl b/src/HuggingFaceDatasets.jl index 3709a5f..bc744cb 100644 --- a/src/HuggingFaceDatasets.jl +++ b/src/HuggingFaceDatasets.jl @@ -3,7 +3,7 @@ module HuggingFaceDatasets using PythonCall using 
MLUtils: getobs, numobs import MLUtils -using DLPack +using DLPack: DLPack using ImageCore const datasets = PythonCall.pynew() @@ -37,8 +37,9 @@ include("load_dataset.jl") export load_dataset function __init__() + ENV["JULIA_CONDAPKG_OPENSSL_VERSION"] = "ignore" # Since it is illegal in PythonCall to import a python module in a module, we need to do this here. - # https://cjdoris.github.io/PythonCall.jl/dev/pythoncall-reference/#PythonCall.pycopy! + # https://juliapy.github.io/PythonCall.jl/dev/pythoncall-reference/#PythonCall.Core.pycopy! PythonCall.pycopy!(datasets, pyimport("datasets")) PythonCall.pycopy!(PIL, pyimport("PIL")) pyimport("PIL.PngImagePlugin") diff --git a/src/transforms.jl b/src/transforms.jl index c95b798..0e86b66 100644 --- a/src/transforms.jl +++ b/src/transforms.jl @@ -1,10 +1,32 @@ -# See https://github.com/cjdoris/PythonCall.jl/issues/172. -function _pyconvert(x::Py) +""" + py2jl(x) + +Convert Python types to Julia types. It will recursively traverse built-in python +containers such as lists, tuples, dicts, and sets, and convert all nested objects. +On the leaves, it will call either `pyconvert(Any, x)` or [`numpy2jl`](@ref). 
+""" +py2jl(x) = pyconvert(Any, x) + +function py2jl(x::Py) + # handle datasets if pyisinstance(x, datasets.Dataset) return Dataset(x) elseif pyisinstance(x, datasets.DatasetDict) return DatasetDict(x) + # handle list, tuple, dict, and set + elseif pyisinstance(x, pytype(pylist())) + return [py2jl(x) for x in x] + elseif pyisinstance(x, pytype(pytuple())) + return Tuple(py2jl(x) for x in x) # `Tuple` collects the generator; `tuple(gen)` would return a 1-tuple holding the generator itself + elseif pyisinstance(x, pytype(pydict())) + return Dict(py2jl(k) => py2jl(v) for (k, v) in x.items()) + elseif pyisinstance(x, pytype(pyset())) + return Set(py2jl(x) for x in x) + # handle numpy arrays + elseif pyisinstance(x, np.ndarray) + return numpy2jl(x) + # handle PIL images elseif pyisinstance(x, PIL.PngImagePlugin.PngImageFile) || pyisinstance(x, PIL.JpegImagePlugin.JpegImageFile) a = numpy2jl(np.array(x)) if ndims(a) == 3 && size(a, 1) == 3 @@ -14,49 +36,36 @@ function _pyconvert(x::Py) else error("Unknown image format") end - elseif pyisinstance(x, np.ndarray) - return numpy2jl(x) + # handle other types else return pyconvert(Any, x) end end -# Do nothing on a non-Py object. -_pyconvert(x) = x - -""" - py2jl(x) - -Convert Python types to Julia types applying `pyconvert` recursively. -""" -py2jl - -# py2jl recurses through pycanonicalize and converts through _pyconvert -py2jl(x) = pycanonicalize(_pyconvert(x)) - -pycanonicalize(x) = x - -pycanonicalize(x::PyList) = [py2jl(x) for x in x] -pycanonicalize(x::PyDict) = Dict(py2jl(k) => py2jl(v) for (k, v) in pairs(x)) """ numpy2jl(x) -Convert a numpy array to a Julia array using DLPack. +Convert a numpy array to a Julia array using DLPack.jl. The conversion is copyless, and mutations to the Julia array are reflected in the numpy array. +For row major python arrays, the returned Julia array has permuted dimensions. + +This function is called by [`py2jl`](@ref). +See also [`jl2numpy`](@ref). 
""" function numpy2jl(x::Py) - # pyconvert(Any, x) - # PyArray(x, copy=false) - if Bool(x.dtype.type == np.str_) - return PyArray(x, copy=false) - else - return DLPack.wrap(x, x -> x.__dlpack__()) - end + return DLPack.from_dlpack(x) end -## TODO this doesn't work yet. -## https://github.com/pabloferz/DLPack.jl/issues/32 -# function jl2numpy(x::AbstractArray) -# return DLPack.share(x, np.from_dlpack) -# end +""" + jl2numpy(x) + +Convert a Julia array to a numpy array using DLPack.jl. +The conversion is copyless, and mutations to the numpy array are reflected in the Julia array. +The returned numpy array has permuted dimensions with respect to the input Julia array. + +See also [`numpy2jl`](@ref). +""" +function jl2numpy(x::AbstractArray) + return DLPack.share(x, np.from_dlpack) +end diff --git a/test/no_ci.jl b/test/no_ci.jl index dc88733..631c09a 100644 --- a/test/no_ci.jl +++ b/test/no_ci.jl @@ -1,5 +1,3 @@ -using HuggingFaceDatasets, ImageShow, Test, ImageCore - @testset "image classification" begin @testset "cifar10" begin ds = load_dataset("cifar10", split = "test").with_format("julia") @@ -25,8 +23,12 @@ end ds = load_dataset("cppe-5", split = "test").with_format("julia") @test ds[1]["image"] isa AbstractMatrix{RGB{N0f8}} @test ds[1]["objects"] isa Dict{String, Vector} - - @test ds[1:2]["image"] isa Vector{<:AbstractMatrix{RGB{N0f8}}} + imgs = ds[1:2]["image"] + @test imgs isa Vector{<:AbstractArray} + @test imgs isa Vector{<:AbstractMatrix} broken=true + @test imgs isa Vector{<:AbstractMatrix{RGB{N0f8}}} broken=true + @test size(imgs[1]) == (1920, 1088) + @test imgs[1] isa AbstractMatrix{RGB{N0f8}} @test ds[1:2]["objects"] isa Vector{Dict{String, Vector}} end end