diff --git a/CHANGELOG.md b/CHANGELOG.md index b264433..b00b7e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [2021-02-10 - Version 1.3.0](https://github.com/matajoh/libnpy/releases/tag/v1.3.0) + +New Features: +- Support for Unicode string tensors (npy type 'U') + +Breaking change: +- `CopyFrom` interface for C# Tensors has been changed to use *Buffer objects + ## [2021-02-09 - Version 1.2.2](https://github.com/matajoh/libnpy/releases/tag/v1.2.2) Improvements: diff --git a/CSharpWrapper/NumpyIONative.i b/CSharpWrapper/NumpyIONative.i index f553680..9cffc84 100644 --- a/CSharpWrapper/NumpyIONative.i +++ b/CSharpWrapper/NumpyIONative.i @@ -8,6 +8,7 @@ %include "std_vector.i" %include "std_string.i" +%include "std_wstring.i" %include "stdint.i" %include "arrays_csharp.i" %include "typemaps.i" @@ -25,7 +26,8 @@ enum class data_type_t : std::uint8_t { INT64, UINT64, FLOAT32, - FLOAT64 + FLOAT64, + UNICODE_STRING }; %rename(Endian) endian_t; @@ -43,6 +45,13 @@ enum class compression_method_t : std::uint16_t { DEFLATED = 8 }; +%typemap(ctype, out="void *") const wstring * "wchar_t *" +%typemap(imtype, + inattributes="[global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]", + outattributes="[return: global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]" + ) const wstring * "string[]" +%typemap(cstype) const wstring * "string[]" + %template(UInt8Buffer) std::vector; %template(Int8Buffer) std::vector; %template(UInt16Buffer) std::vector; @@ -53,6 +62,8 @@ enum class compression_method_t : std::uint16_t { %template(Int64Buffer) std::vector; %template(Float32Buffer) std::vector; %template(Float64Buffer) std::vector; +%apply const std::wstring & {std::wstring &}; +%template(UnicodeStringBuffer) std::vector; %template(Shape) std::vector; @@ -96,17 +107,6 @@ header_info peek(const std::string& path); template class tensor { public: - %apply unsigned char FIXED[] {const unsigned char *source}; - %apply signed char FIXED[] {const signed char *source}; - %apply unsigned short FIXED[] {const unsigned short *source}; - %apply short FIXED[] {const short *source}; - %apply unsigned int FIXED[] {const unsigned int *source}; - %apply int FIXED[] {const int *source}; - %apply unsigned long long FIXED[] {const unsigned long long *source}; - %apply long long FIXED[] {const long long *source}; - %apply float FIXED[] {const float *source}; - %apply double FIXED[] {const double *source}; - %exception tensor(const std::string& path) %{ try{ $action @@ -139,7 +139,7 @@ public: %rename(Save) save; void save(const std::string& path, endian_t endian = endian_t::NATIVE); - %exception copy_from(const T* source, size_t nitems) %{ + %exception copy_from(const std::vector& source) %{ try{ $action } catch (std::invalid_argument& e){ @@ -150,7 +150,7 @@ public: %csmethodmodifiers copy_from "public unsafe override"; %rename(CopyFrom) copy_from; - void copy_from(const T* source, size_t itemCount); + void copy_from(const std::vector& source); %csmethodmodifiers values "protected override" %rename(getValues) values; @@ -223,6 +223,8 @@ public: %template(Float32Tensor) tensor; %typemap(csbase) SWIGTYPE "Tensor"; %template(Float64Tensor) tensor; +%typemap(csbase) SWIGTYPE "Tensor"; +%template(UnicodeStringTensor) tensor; %typemap(csbase) SWIGTYPE "" @@ -261,6 +263,7 @@ public: %template(Write) write; %template(Write) write; %template(Write) write; + %template(Write) write; }; %rename(NPZInputStream) inpzstream; @@ -329,4 +332,5 @@ public: %template(ReadInt64) read; %template(ReadFloat32) read; %template(ReadFloat64) read; + %template(ReadUnicodeString) read; }; diff --git a/CSharpWrapper/Tensor.cs b/CSharpWrapper/Tensor.cs index 6cb7124..6ef27cc 100644 --- a/CSharpWrapper/Tensor.cs +++ b/CSharpWrapper/Tensor.cs @@ -22,12 +22,11 @@ namespace NumpyIO public abstract class Tensor where B : IList { /// - /// Copy the data from the provided array. These values will + /// Copy the data from the provided buffer. These values will /// be copied into the underlying C++ type. /// - /// The source array - /// The number of items to copy - public abstract void CopyFrom(T[] source, uint nitems); + /// The source buffer + public abstract void CopyFrom(B source); /// /// Save the tensor to the provided location on the disk. diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 8be32a9..251684b 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,2 +1,5 @@ -Improvements: -- Bug fix for a missing comma on 1d shape \ No newline at end of file +New Features: +- Support for Unicode string tensors (npy type 'U') + +Breaking change: +- `CopyFrom` interface for C# Tensors has been changed to use *Buffer objects \ No newline at end of file diff --git a/VERSION b/VERSION index d2d61a7..589268e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.2 \ No newline at end of file +1.3.0 \ No newline at end of file diff --git a/assets/test/test.npz b/assets/test/test.npz index 14d2d4b..2611bf6 100644 Binary files a/assets/test/test.npz and b/assets/test/test.npz differ diff --git a/assets/test/test_compressed.npz b/assets/test/test_compressed.npz index 2acf99f..e52805c 100644 Binary files a/assets/test/test_compressed.npz and b/assets/test/test_compressed.npz differ diff --git a/assets/test/unicode.npy b/assets/test/unicode.npy new file mode 100644 index 0000000..80a0b56 Binary files /dev/null and b/assets/test/unicode.npy differ diff --git a/include/npy/core.h b/include/npy/core.h index 413e5ed..a88db5a 100644 --- a/include/npy/core.h +++ b/include/npy/core.h @@ -66,7 +66,9 @@ enum class data_type_t : std::uint8_t /** 32-bit floating point value (float) */ FLOAT32, /** 64-bit floating point value (double) */ - FLOAT64 + FLOAT64, + /** Unicode string (std::wstring) */ + UNICODE_STRING }; /** Convert a data type and endianness to a NPY dtype string. diff --git a/include/npy/npy.h b/include/npy/npy.h index 41bce14..cd0983d 100644 --- a/include/npy/npy.h +++ b/include/npy/npy.h @@ -56,6 +56,9 @@ struct header_info /** A vector of values indicating the shape of each dimension of the tensor. */ std::vector shape; + + /** Value used to indicate the maximum length of an element (used by Unicode strings) */ + std::size_t max_element_length; }; /** Writes an NPY header to the provided stream. @@ -110,6 +113,25 @@ void write_npy_header(std::basic_ostream &output, output.write(reinterpret_cast(end.data()), end.length()); } +template +void copy_to(const T* data_ptr, std::size_t num_elements, std::basic_ostream& output, npy::endian_t endianness) +{ + if (endianness == npy::endian_t::NATIVE || endianness == native_endian()) + { + output.write(reinterpret_cast(data_ptr), num_elements * sizeof(T)); + } + else + { + CHAR buffer[sizeof(T)]; + for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr) + { + const CHAR *start = reinterpret_cast(curr); + std::reverse_copy(start, start + sizeof(T), buffer); + output.write(buffer, sizeof(T)); + } + } +} + /** Saves a tensor to the provided stream. * \tparam T the data type * \tparam TENSOR the tensor type. @@ -120,32 +142,72 @@ void write_npy_header(std::basic_ostream &output, */ template class TENSOR, - typename CHAR> + typename CHAR, + std::enable_if_t::value, int> = 42> void save(std::basic_ostream &output, const TENSOR &tensor, endian_t endianness = npy::endian_t::NATIVE) { auto dtype = to_dtype(tensor.dtype(), endianness); write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape()); + copy_to(tensor.data(), tensor.size(), output, endianness); +}; - if (endianness == npy::endian_t::NATIVE || - endianness == native_endian() || - dtype[0] == '|') +/** Saves a unicode string tensor to the provided stream. + * \tparam TENSOR the tensor type. + * \param output the output stream + * \param tensor the tensor + * \param endianness the endianness to use in saving the tensor + * \sa npy::tensor + */ +template class TENSOR, + typename CHAR, + std::enable_if_t::value, int> = 42> +void save(std::basic_ostream &output, + const TENSOR &tensor, + endian_t endianness = npy::endian_t::NATIVE) +{ + std::size_t max_length = 0; + for(const auto& element : tensor) { - output.write(reinterpret_cast(tensor.data()), tensor.size() * sizeof(T)); + if(element.size() > max_length) + { + max_length = element.size(); + } } - else + + if(endianness == npy::endian_t::NATIVE) { - CHAR buffer[sizeof(T)]; - for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr) + endianness = native_endian(); + } + + std::string dtype = ">U" + std::to_string(max_length); + if(endianness == npy::endian_t::LITTLE) + { + dtype = " unicode(tensor.size() * max_length, 0); + auto word_start = unicode.begin(); + for(const auto& element : tensor) + { + auto char_it = word_start; + for(const auto& wchar : element) { - const CHAR *start = reinterpret_cast(curr); - std::reverse_copy(start, start + sizeof(T), buffer); - output.write(buffer, sizeof(T)); + *char_it = static_cast(wchar); + char_it += 1; } + + word_start += max_length; } + + copy_to(unicode.data(), unicode.size(), output, endianness); }; + /** Saves a tensor to the provided location on disk. * \tparam T the data type * \tparam TENSOR the tensor type. @@ -166,7 +228,7 @@ void save(const std::string &path, throw std::invalid_argument("path"); } - save(output, tensor, endianness); + save(output, tensor, endianness); }; /** Read an NPY header from the provided stream. @@ -202,6 +264,26 @@ header_info read_npy_header(std::basic_istream &input) return header_info(dictionary); } +template +void copy_to(std::basic_istream &input, T* data_ptr, std::size_t num_elements, npy::endian_t endianness) +{ + if (endianness == npy::endian_t::NATIVE || endianness == native_endian()) + { + CHAR *start = reinterpret_cast(data_ptr); + input.read(start, num_elements * sizeof(T)); + } + else + { + CHAR buffer[sizeof(T)]; + for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr) + { + input.read(buffer, sizeof(T)); + CHAR *start = reinterpret_cast(curr); + std::reverse_copy(buffer, buffer + sizeof(T), start); + } + } +} + /** Loads a tensor in NPY format from the provided stream. The type of the tensor * must match the data to be read. * \tparam T the data type @@ -212,7 +294,8 @@ header_info read_npy_header(std::basic_istream &input) */ template class TENSOR, - typename CHAR> + typename CHAR, + std::enable_if_t::value, int> = 42> TENSOR load(std::basic_istream &input) { header_info info = read_npy_header(input); @@ -222,20 +305,45 @@ TENSOR load(std::basic_istream &input) throw std::logic_error("requested dtype does not match stream's dtype"); } - if (info.endianness == npy::endian_t::NATIVE || info.endianness == native_endian()) + copy_to(input, tensor.data(), tensor.size(), info.endianness); + return tensor; +} + + +/** Loads a unicode string tensor in NPY format from the provided stream. The type of the tensor + * must match the data to be read. + * \tparam T the data type + * \tparam TENSOR the tensor type + * \param input the input stream + * \return an object of type TENSOR read from the stream + * \sa npy::tensor + */ +template class TENSOR, + typename CHAR, + std::enable_if_t::value, int> = 42> +TENSOR load(std::basic_istream &input) +{ + header_info info = read_npy_header(input); + TENSOR tensor(info.shape, info.fortran_order); + if (info.dtype != tensor.dtype()) { - CHAR *start = reinterpret_cast(tensor.data()); - input.read(start, tensor.size() * sizeof(T)); + throw std::logic_error("requested dtype does not match stream's dtype"); } - else + + std::vector unicode(tensor.size() * info.max_element_length, 0); + copy_to(input, unicode.data(), unicode.size(), info.endianness); + + auto word_start = unicode.begin(); + for(auto& element : tensor) { - CHAR buffer[sizeof(T)]; - for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr) + auto char_it = word_start; + for(std::size_t i=0; i 0; ++i, ++char_it) { - input.read(buffer, sizeof(T)); - CHAR *start = reinterpret_cast(curr); - std::reverse_copy(buffer, buffer + sizeof(T), start); + element.push_back(static_cast(*char_it)); } + + word_start += info.max_element_length; } return tensor; diff --git a/include/npy/npz.h b/include/npy/npz.h index b8fae7b..a4fb7fa 100644 --- a/include/npy/npz.h +++ b/include/npy/npz.h @@ -95,7 +95,7 @@ class onpzstream } omemstream output; - save(output, tensor); + save(output, tensor); std::string suffix = ".npy"; std::string name = filename; diff --git a/src/npy.cpp b/src/npy.cpp index cfd29b8..57003bc 100644 --- a/src/npy.cpp +++ b/src/npy.cpp @@ -111,7 +111,19 @@ header_info::header_info(const std::string &dictionary) skip_whitespace(input); if (key == "descr") { - std::tie(this->dtype, this->endianness) = from_dtype(read_string(input)); + std::string dtype = read_string(input); + if(dtype[1] == 'U') + { + this->dtype = npy::data_type_t::UNICODE_STRING; + this->endianness = dtype[0] == '>' ? npy::endian_t::BIG : npy::endian_t::LITTLE; + this->max_element_length = std::stoi(dtype.substr(2)); + } + else + { + std::tie(this->dtype, this->endianness) = from_dtype(dtype); + this->max_element_length = 0; + } + } else if (key == "fortran_order") { diff --git a/src/tensor.cpp b/src/tensor.cpp index 664e690..0fc904b 100644 --- a/src/tensor.cpp +++ b/src/tensor.cpp @@ -62,4 +62,10 @@ data_type_t tensor::get_dtype() return data_type_t::FLOAT64; }; +template <> +data_type_t tensor::get_dtype() +{ + return data_type_t::UNICODE_STRING; +} + } // namespace npy \ No newline at end of file diff --git a/test/CSharpTests/test_exceptions.cs b/test/CSharpTests/test_exceptions.cs index eceed25..7d130ca 100644 --- a/test/CSharpTests/test_exceptions.cs +++ b/test/CSharpTests/test_exceptions.cs @@ -56,8 +56,8 @@ static void NPZInputStreamPeekInvalidFilename() static void TensorCopyFrom() { - byte[] buffer = new byte[10]; - TENSOR.CopyFrom(buffer, 10); + UInt8Buffer buffer = new UInt8Buffer(new byte[10]); + TENSOR.CopyFrom(buffer); } static void TensorIndexSize() diff --git a/test/CSharpTests/test_npz_read.cs b/test/CSharpTests/test_npz_read.cs index 11009e8..a365d62 100644 --- a/test/CSharpTests/test_npz_read.cs +++ b/test/CSharpTests/test_npz_read.cs @@ -11,12 +11,14 @@ static void TestRead(bool compressed, ref int result) { UInt8Tensor expectedColor = Test.Tensor(new Shape(new uint[] { 5, 5, 3 })); Float32Tensor expectedDepth = Test.Tensor(new Shape(new uint[] { 5, 5 })); + UnicodeStringTensor expectedUnicode = Test.Tensor(new Shape(new uint[]{5, 2, 5})); string filename = compressed ? "test_compressed.npz" : "test.npz"; NPZInputStream stream = new NPZInputStream(Test.AssetPath(filename)); - UInt8Tensor actualColor = stream.ReadUInt8("color.npy"); + UInt8Tensor actualColor = stream.ReadUInt8("color"); Float32Tensor actualDepth = stream.ReadFloat32("depth.npy"); + UnicodeStringTensor actualUnicode = stream.ReadUnicodeString("unicode.npy"); string tag = "c#_npz_read"; if (compressed) @@ -26,6 +28,7 @@ static void TestRead(bool compressed, ref int result) Test.AssertEqual(expectedColor, actualColor, ref result, tag + " color"); Test.AssertEqual(expectedDepth, actualDepth, ref result, tag + " depth"); + Test.AssertEqual(expectedUnicode, actualUnicode, ref result, tag + " unicode"); } public static int Main() { diff --git a/test/CSharpTests/test_npz_write.cs b/test/CSharpTests/test_npz_write.cs index 2f137e2..748da13 100644 --- a/test/CSharpTests/test_npz_write.cs +++ b/test/CSharpTests/test_npz_write.cs @@ -14,10 +14,12 @@ static void TestWrite(bool compressed, ref int result) UInt8Tensor color = Test.Tensor(new Shape(new uint[] { 5, 5, 3 })); Float32Tensor depth = Test.Tensor(new Shape(new uint[] { 5, 5 })); + UnicodeStringTensor unicode = Test.Tensor(new Shape(new uint[]{5, 2, 5})); string path = Path.GetRandomFileName(); NPZOutputStream stream = new NPZOutputStream(path, compressed ? CompressionMethod.DEFLATED : CompressionMethod.STORED); stream.Write("color.npy", color); - stream.Write("depth.npy", depth); + stream.Write("depth", depth); + stream.Write("unicode.npy", unicode); stream.Close(); byte[] actual = File.ReadAllBytes(path); diff --git a/test/libnpy_tests.h b/test/libnpy_tests.h index 9581338..5fff5d0 100644 --- a/test/libnpy_tests.h +++ b/test/libnpy_tests.h @@ -90,6 +90,28 @@ inline void assert_equal(const std::string &expected, } } +template<> +inline void assert_equal(const std::wstring &expected, + const std::wstring &actual, + int &result, + const std::string &tag) +{ + assert_equal(expected.length(), actual.length(), result, tag + " length"); + if (result == EXIT_SUCCESS) + { + for (std::size_t i = 0; i < expected.size(); ++i) + { + int expected_val = static_cast(expected[i]); + int actual_val = static_cast(actual[i]); + assert_equal(expected_val, actual_val, result, tag + "[" + std::to_string(i) + "]"); + if (result == EXIT_FAILURE) + { + break; + } + } + } +} + template <> inline void assert_equal(const npy::header_info &expected, const npy::header_info &actual, @@ -102,6 +124,29 @@ inline void assert_equal(const npy::header_info &expected, assert_equal(expected.shape, actual.shape, result, tag + " shape"); } +template<> +inline void assert_equal>(const npy::tensor &expected, + const npy::tensor &actual, + int &result, + const std::string &tag) +{ + assert_equal(to_dtype(expected.dtype()), to_dtype(actual.dtype()), result, tag + " dtype"); + assert_equal(expected.fortran_order(), actual.fortran_order(), result, tag + " fortran_order"); + assert_equal(expected.shape(), actual.shape(), result, tag + " shape"); + + auto expected_it = expected.begin(); + auto actual_it = actual.begin(); + for (std::size_t i = 0; i < expected.size(); ++i, ++expected_it, ++actual_it) + { + if (*expected_it != *actual_it) + { + result = EXIT_FAILURE; + std::wcout << std::wstring(tag.begin(), tag.end()) << " is incorrect: " << *actual_it << " != " << *expected_it << std::endl; + break; + } + } +} + template void assert_throws(void (*function)(), int &result, const std::string &tag) { @@ -136,6 +181,20 @@ npy::tensor test_tensor(const std::vector &shape) return tensor; }; +template <> +inline npy::tensor test_tensor(const std::vector &shape) +{ + npy::tensor tensor(shape); + int i=0; + for(auto& word : tensor) + { + word = std::to_wstring(i); + i += 1; + } + + return tensor; +} + template npy::tensor test_fortran_tensor() { @@ -156,12 +215,32 @@ npy::tensor test_fortran_tensor() return tensor; } +template <> +inline npy::tensor test_fortran_tensor() +{ + std::vector values = { + 0, 10, 20, 30, 40, 5, 15, 25, 35, 45, + 1, 11, 21, 31, 41, 6, 16, 26, 36, 46, + 2, 12, 22, 32, 42, 7, 17, 27, 37, 47, + 3, 13, 23, 33, 43, 8, 18, 28, 38, 48, + 4, 14, 24, 34, 44, 9, 19, 29, 39, 49}; + npy::tensor tensor({5, 2, 5}, true); + auto dst = tensor.data(); + auto src = values.begin(); + for (; dst < tensor.data() + tensor.size(); ++src, ++dst) + { + *dst = std::to_wstring(*src); + } + + return tensor; +} + template std::string npy_stream(npy::endian_t endianness = npy::endian_t::NATIVE) { std::ostringstream actual_stream; npy::tensor tensor = test_tensor({5, 2, 5}); - npy::save(actual_stream, tensor, endianness); + npy::save(actual_stream, tensor, endianness); return actual_stream.str(); } diff --git a/test/npy_read.cpp b/test/npy_read.cpp index 906bf49..66cdfef 100644 --- a/test/npy_read.cpp +++ b/test/npy_read.cpp @@ -1,6 +1,7 @@ #include "libnpy_tests.h" #include "npy_read.h" + int test_npy_read() { int result = EXIT_SUCCESS; @@ -19,6 +20,7 @@ int test_npy_read() test_read(result, "int64"); test_read(result, "float32"); test_read(result, "float64"); + test_read(result, "unicode"); return result; } \ No newline at end of file diff --git a/test/npy_write.cpp b/test/npy_write.cpp index 4145313..8cfe01c 100644 --- a/test/npy_write.cpp +++ b/test/npy_write.cpp @@ -67,5 +67,9 @@ int test_npy_write() actual = test::npy_stream(npy::endian_t::LITTLE); test::assert_equal(expected, actual, result, "npy_write_float64"); + expected = test::read_asset("unicode.npy"); + actual = test::npy_stream(npy::endian_t::LITTLE); + test::assert_equal(expected, actual, result, "npy_write_unicode"); + return result; }; diff --git a/test/npz_read.cpp b/test/npz_read.cpp index 294c862..cc15783 100644 --- a/test/npz_read.cpp +++ b/test/npz_read.cpp @@ -7,14 +7,17 @@ void _test(int &result, const std::string &filename, bool compressed) { auto expected_color = test::test_tensor({5, 5, 3}); auto expected_depth = test::test_tensor({5, 5}); + auto expected_unicode = test::test_tensor({5, 2, 5}); npy::inpzstream stream(test::asset_path(filename)); auto actual_color = stream.read("color.npy"); auto actual_depth = stream.read("depth"); + auto actual_unicode = stream.read("unicode"); std::string suffix = compressed ? "_compressed" : ""; test::assert_equal(expected_color, actual_color, result, "npz_read_color" + suffix); test::assert_equal(expected_depth, actual_depth, result, "npz_read_depth" + suffix); + test::assert_equal(expected_unicode, actual_unicode, result, "npz_read_unicode" + suffix); } } // namespace diff --git a/test/npz_write.cpp b/test/npz_write.cpp index 6d23831..75f6d3c 100644 --- a/test/npz_write.cpp +++ b/test/npz_write.cpp @@ -30,6 +30,7 @@ void _test(int &result, npy::compression_method_t compression_method) npy::onpzstream npz(TEMP_NPZ, compression_method, npy::endian_t::LITTLE); npz.write("color", test::test_tensor({5, 5, 3})); npz.write("depth.npy", test::test_tensor({5, 5})); + npz.write("unicode.npy", test::test_tensor({5, 2, 5})); } std::string actual = test::read_file(TEMP_NPZ);