Adding Unicode support (#21)
* Adding unicode

* All tests passing

* Bumping the version

* Adding C# + adding tests

* Adding to release notes
matajoh authored Feb 11, 2021
1 parent 8609cd2 commit 4f19e7d
Showing 21 changed files with 287 additions and 51 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,13 @@
# Changelog

## [2021-02-10 - Version 1.3.0](https://github.com/matajoh/libnpy/releases/tag/v1.3.0)

New Features:
- Support for Unicode string tensors (npy type 'U')

Breaking change:
- `CopyFrom` interface for C# Tensors has been changed to use *Buffer objects

## [2021-02-09 - Version 1.2.2](https://github.com/matajoh/libnpy/releases/tag/v1.2.2)

Improvements:
32 changes: 18 additions & 14 deletions CSharpWrapper/NumpyIONative.i
@@ -8,6 +8,7 @@

%include "std_vector.i"
%include "std_string.i"
%include "std_wstring.i"
%include "stdint.i"
%include "arrays_csharp.i"
%include "typemaps.i"
@@ -25,7 +26,8 @@ enum class data_type_t : std::uint8_t {
INT64,
UINT64,
FLOAT32,
FLOAT64
FLOAT64,
UNICODE_STRING
};

%rename(Endian) endian_t;
@@ -43,6 +45,13 @@ enum class compression_method_t : std::uint16_t {
DEFLATED = 8
};

%typemap(ctype, out="void *") const wstring * "wchar_t *"
%typemap(imtype,
inattributes="[global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]",
outattributes="[return: global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]"
) const wstring * "string[]"
%typemap(cstype) const wstring * "string[]"

%template(UInt8Buffer) std::vector<unsigned char>;
%template(Int8Buffer) std::vector<signed char>;
%template(UInt16Buffer) std::vector<unsigned short>;
@@ -53,6 +62,8 @@ enum class compression_method_t : std::uint16_t {
%template(Int64Buffer) std::vector<long long>;
%template(Float32Buffer) std::vector<float>;
%template(Float64Buffer) std::vector<double>;
%apply const std::wstring & {std::wstring &};
%template(UnicodeStringBuffer) std::vector<std::wstring>;

%template(Shape) std::vector<size_t>;

@@ -96,17 +107,6 @@ header_info peek(const std::string& path);
template <typename T>
class tensor {
public:
%apply unsigned char FIXED[] {const unsigned char *source};
%apply signed char FIXED[] {const signed char *source};
%apply unsigned short FIXED[] {const unsigned short *source};
%apply short FIXED[] {const short *source};
%apply unsigned int FIXED[] {const unsigned int *source};
%apply int FIXED[] {const int *source};
%apply unsigned long long FIXED[] {const unsigned long long *source};
%apply long long FIXED[] {const long long *source};
%apply float FIXED[] {const float *source};
%apply double FIXED[] {const double *source};

%exception tensor(const std::string& path) %{
try{
$action
@@ -139,7 +139,7 @@
%rename(Save) save;
void save(const std::string& path, endian_t endian = endian_t::NATIVE);

%exception copy_from(const T* source, size_t nitems) %{
%exception copy_from(const std::vector<T>& source) %{
try{
$action
} catch (std::invalid_argument& e){
@@ -150,7 +150,7 @@

%csmethodmodifiers copy_from "public unsafe override";
%rename(CopyFrom) copy_from;
void copy_from(const T* source, size_t itemCount);
void copy_from(const std::vector<T>& source);

%csmethodmodifiers values "protected override"
%rename(getValues) values;
@@ -223,6 +223,8 @@ public:
%template(Float32Tensor) tensor<float>;
%typemap(csbase) SWIGTYPE "Tensor<double, Float64Buffer>";
%template(Float64Tensor) tensor<double>;
%typemap(csbase) SWIGTYPE "Tensor<string, UnicodeStringBuffer>";
%template(UnicodeStringTensor) tensor<std::wstring>;

%typemap(csbase) SWIGTYPE ""

@@ -261,6 +263,7 @@ public:
%template(Write) write<long long>;
%template(Write) write<float>;
%template(Write) write<double>;
%template(Write) write<std::wstring>;
};

%rename(NPZInputStream) inpzstream;
@@ -329,4 +332,5 @@ public:
%template(ReadInt64) read<long long>;
%template(ReadFloat32) read<float>;
%template(ReadFloat64) read<double>;
%template(ReadUnicodeString) read<std::wstring>;
};
7 changes: 3 additions & 4 deletions CSharpWrapper/Tensor.cs
@@ -22,12 +22,11 @@ namespace NumpyIO
public abstract class Tensor<T, B> where B : IList<T>
{
/// <summary>
/// Copy the data from the provided array. These values will
/// Copy the data from the provided buffer. These values will
/// be copied into the underlying C++ type.
/// </summary>
/// <param name="source">The source array</param>
/// <param name="nitems">The number of items to copy</param>
public abstract void CopyFrom(T[] source, uint nitems);
/// <param name="source">The source buffer</param>
public abstract void CopyFrom(B source);

/// <summary>
/// Save the tensor to the provided location on the disk.
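The breaking change above replaces CopyFrom(T[] source, uint nitems) with CopyFrom(B source), where B is one of the SWIG-generated *Buffer classes (UInt8Buffer, ..., UnicodeStringBuffer) that wrap a std::vector. A minimal C++-level sketch of the call the wrapper now binds to, assuming npy::tensor exposes the copy_from(const std::vector<T>&) overload declared in the SWIG interface above and the (shape, fortran_order) constructor used by load() in include/npy/npy.h; the include path is also an assumption:

#include <string>
#include <vector>

#include <npy/tensor.h>  // assumed header for npy::tensor

int main()
{
    // Shape {2, 2}: four elements supplied as one flat vector.
    npy::tensor<std::wstring> names({2, 2}, false);  // assumed (shape, fortran_order) ctor
    names.copy_from(std::vector<std::wstring>{L"a", L"bb", L"ccc", L"dddd"});
    return 0;
}

On the C# side the equivalent call passes a UnicodeStringBuffer (or Float64Buffer, etc.) instead of a raw array and item count.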
7 changes: 5 additions & 2 deletions RELEASE_NOTES
@@ -1,2 +1,5 @@
Improvements:
- Bug fix for a missing comma on 1d shape
New Features:
- Support for Unicode string tensors (npy type 'U')

Breaking change:
- `CopyFrom` interface for C# Tensors has been changed to use *Buffer objects
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.2.2
1.3.0
Binary file modified assets/test/test.npz
Binary file modified assets/test/test_compressed.npz
Binary file added assets/test/unicode.npy
4 changes: 3 additions & 1 deletion include/npy/core.h
@@ -66,7 +66,9 @@ enum class data_type_t : std::uint8_t
/** 32-bit floating point value (float) */
FLOAT32,
/** 64-bit floating point value (double) */
FLOAT64
FLOAT64,
/** Unicode string (std::wstring) */
UNICODE_STRING
};

/** Convert a data type and endianness to a NPY dtype string.
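The new UNICODE_STRING value corresponds to numpy's 'U' dtype kind. As an illustrative sketch only (not the library's to_dtype implementation), the dtype string for a Unicode tensor combines the byte-order character with the maximum element length in code points, each code point being written as a 4-byte value by the save overload added to include/npy/npy.h below:

#include <cstddef>
#include <string>

// Illustrative helper: "<U7" means little-endian Unicode, 7 code points per
// element, i.e. 7 * 4 = 28 bytes per element in the .npy payload.
std::string unicode_dtype(bool little_endian, std::size_t max_length)
{
    return (little_endian ? "<U" : ">U") + std::to_string(max_length);
}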
152 changes: 130 additions & 22 deletions include/npy/npy.h
@@ -56,6 +56,9 @@ struct header_info

/** A vector of values indicating the shape of each dimension of the tensor. */
std::vector<size_t> shape;

/** Value used to indicate the maximum length of an element (used by Unicode strings) */
std::size_t max_element_length;
};

/** Writes an NPY header to the provided stream.
@@ -110,6 +113,25 @@ void write_npy_header(std::basic_ostream<CHAR> &output,
output.write(reinterpret_cast<const CHAR *>(end.data()), end.length());
}

template<typename T, typename CHAR>
void copy_to(const T* data_ptr, std::size_t num_elements, std::basic_ostream<CHAR>& output, npy::endian_t endianness)
{
if (endianness == npy::endian_t::NATIVE || endianness == native_endian())
{
output.write(reinterpret_cast<const CHAR *>(data_ptr), num_elements * sizeof(T));
}
else
{
CHAR buffer[sizeof(T)];
for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr)
{
const CHAR *start = reinterpret_cast<const CHAR *>(curr);
std::reverse_copy(start, start + sizeof(T), buffer);
output.write(buffer, sizeof(T));
}
}
}

/** Saves a tensor to the provided stream.
* \tparam T the data type
* \tparam TENSOR the tensor type.
@@ -120,32 +142,72 @@ void write_npy_header(std::basic_ostream<CHAR> &output,
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR>
typename CHAR,
std::enable_if_t<!std::is_same<std::wstring, T>::value, int> = 42>
void save(std::basic_ostream<CHAR> &output,
const TENSOR<T> &tensor,
endian_t endianness = npy::endian_t::NATIVE)
{
auto dtype = to_dtype(tensor.dtype(), endianness);
write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape());
copy_to(tensor.data(), tensor.size(), output, endianness);
};

if (endianness == npy::endian_t::NATIVE ||
endianness == native_endian() ||
dtype[0] == '|')
/** Saves a unicode string tensor to the provided stream.
* \tparam TENSOR the tensor type.
* \param output the output stream
* \param tensor the tensor
* \param endianness the endianness to use in saving the tensor
* \sa npy::tensor
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR,
std::enable_if_t<std::is_same<std::wstring, T>::value, int> = 42>
void save(std::basic_ostream<CHAR> &output,
const TENSOR<std::wstring> &tensor,
endian_t endianness = npy::endian_t::NATIVE)
{
std::size_t max_length = 0;
for(const auto& element : tensor)
{
output.write(reinterpret_cast<const CHAR *>(tensor.data()), tensor.size() * sizeof(T));
if(element.size() > max_length)
{
max_length = element.size();
}
}
else

if(endianness == npy::endian_t::NATIVE)
{
CHAR buffer[sizeof(T)];
for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr)
endianness = native_endian();
}

std::string dtype = ">U" + std::to_string(max_length);
if(endianness == npy::endian_t::LITTLE)
{
dtype = "<U" + std::to_string(max_length);
}

write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape());

std::vector<std::int32_t> unicode(tensor.size() * max_length, 0);
auto word_start = unicode.begin();
for(const auto& element : tensor)
{
auto char_it = word_start;
for(const auto& wchar : element)
{
const CHAR *start = reinterpret_cast<const CHAR *>(curr);
std::reverse_copy(start, start + sizeof(T), buffer);
output.write(buffer, sizeof(T));
*char_it = static_cast<std::int32_t>(wchar);
char_it += 1;
}

word_start += max_length;
}

copy_to(unicode.data(), unicode.size(), output, endianness);
};


/** Saves a tensor to the provided location on disk.
* \tparam T the data type
* \tparam TENSOR the tensor type.
Expand All @@ -166,7 +228,7 @@ void save(const std::string &path,
throw std::invalid_argument("path");
}

save(output, tensor, endianness);
save<T, TENSOR, char>(output, tensor, endianness);
};

/** Read an NPY header from the provided stream.
@@ -202,6 +264,26 @@ header_info read_npy_header(std::basic_istream<CHAR> &input)
return header_info(dictionary);
}

template <typename T, typename CHAR>
void copy_to(std::basic_istream<CHAR> &input, T* data_ptr, std::size_t num_elements, npy::endian_t endianness)
{
if (endianness == npy::endian_t::NATIVE || endianness == native_endian())
{
CHAR *start = reinterpret_cast<CHAR *>(data_ptr);
input.read(start, num_elements * sizeof(T));
}
else
{
CHAR buffer[sizeof(T)];
for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr)
{
input.read(buffer, sizeof(T));
CHAR *start = reinterpret_cast<CHAR *>(curr);
std::reverse_copy(buffer, buffer + sizeof(T), start);
}
}
}

/** Loads a tensor in NPY format from the provided stream. The type of the tensor
* must match the data to be read.
* \tparam T the data type
@@ -212,7 +294,8 @@ header_info read_npy_header(std::basic_istream<CHAR> &input)
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR>
typename CHAR,
std::enable_if_t<!std::is_same<std::wstring, T>::value, int> = 42>
TENSOR<T> load(std::basic_istream<CHAR> &input)
{
header_info info = read_npy_header(input);
@@ -222,20 +305,45 @@ TENSOR<T> load(std::basic_istream<CHAR> &input)
throw std::logic_error("requested dtype does not match stream's dtype");
}

if (info.endianness == npy::endian_t::NATIVE || info.endianness == native_endian())
copy_to(input, tensor.data(), tensor.size(), info.endianness);
return tensor;
}


/** Loads a unicode string tensor in NPY format from the provided stream. The type of the tensor
* must match the data to be read.
* \tparam T the data type
* \tparam TENSOR the tensor type
* \param input the input stream
* \return an object of type TENSOR<T> read from the stream
* \sa npy::tensor
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR,
std::enable_if_t<std::is_same<std::wstring, T>::value, int> = 42>
TENSOR<T> load(std::basic_istream<CHAR> &input)
{
header_info info = read_npy_header(input);
TENSOR<T> tensor(info.shape, info.fortran_order);
if (info.dtype != tensor.dtype())
{
CHAR *start = reinterpret_cast<CHAR *>(tensor.data());
input.read(start, tensor.size() * sizeof(T));
throw std::logic_error("requested dtype does not match stream's dtype");
}
else

std::vector<std::int32_t> unicode(tensor.size() * info.max_element_length, 0);
copy_to(input, unicode.data(), unicode.size(), info.endianness);

auto word_start = unicode.begin();
for(auto& element : tensor)
{
CHAR buffer[sizeof(T)];
for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr)
auto char_it = word_start;
for(std::size_t i=0; i<info.max_element_length && *char_it > 0; ++i, ++char_it)
{
input.read(buffer, sizeof(T));
CHAR *start = reinterpret_cast<CHAR *>(curr);
std::reverse_copy(buffer, buffer + sizeof(T), start);
element.push_back(static_cast<wchar_t>(*char_it));
}

word_start += info.max_element_length;
}

return tensor;
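A minimal end-to-end sketch of the new Unicode path. It assumes the npy::tensor class referenced by these templates provides the (shape, fortran_order) constructor used by load() above and the copy_from(const std::vector<T>&) overload declared in the SWIG interface; the include paths are assumptions, while the path-based save and the stream-based load are the overloads shown in this diff:

#include <fstream>
#include <string>
#include <vector>

#include <npy/npy.h>     // save/load shown in this file
#include <npy/tensor.h>  // assumed header for npy::tensor

int main()
{
    // Two elements; "alpha" (5 code points) sets the width, so the dtype is
    // "<U5" on a little-endian machine and "bye" is zero-padded to 5.
    npy::tensor<std::wstring> names({2}, false);  // assumed (shape, fortran_order) ctor
    names.copy_from(std::vector<std::wstring>{L"alpha", L"bye"});

    npy::save("names.npy", names);  // path-based overload from this header

    std::ifstream file("names.npy", std::ios::binary);
    auto copy = npy::load<std::wstring, npy::tensor>(file);  // Unicode load overload
    return copy.size() == names.size() ? 0 : 1;
}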
2 changes: 1 addition & 1 deletion include/npy/npz.h
@@ -95,7 +95,7 @@ class onpzstream
}

omemstream output;
save(output, tensor);
save<T, TENSOR, omemstream::char_type>(output, tensor);

std::string suffix = ".npy";
std::string name = filename;
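In the NPZ writer above, save is now called with explicit template arguments; at the user level nothing changes. A hedged usage sketch follows; the onpzstream constructor taking the archive path and the write(filename, tensor) signature are assumptions inferred from the snippet above and the SWIG wrapper, not shown in this diff:

#include <string>
#include <vector>

#include <npy/npz.h>     // onpzstream from this file
#include <npy/tensor.h>  // assumed header for npy::tensor

int main()
{
    npy::tensor<std::wstring> names({2}, false);  // assumed (shape, fortran_order) ctor
    names.copy_from(std::vector<std::wstring>{L"alpha", L"bye"});

    npy::onpzstream archive("strings.npz");  // assumed path-taking constructor
    archive.write("names.npy", names);       // assumed (filename, tensor) signature
    return 0;
}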