Adding Unicode support (#21)
* Adding unicode

* All tests passing

* Bumping the version

* Adding C# + adding tests

* Adding to release notes
matajoh authored Feb 11, 2021
1 parent 8609cd2 commit 4f19e7d
Showing 21 changed files with 287 additions and 51 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,13 @@
# Changelog

## [2021-02-10 - Version 1.3.0](https://github.com/matajoh/libnpy/releases/tag/v1.3.0)

New Features:
- Support for Unicode string tensors (npy type 'U')

Breaking change:
- `CopyFrom` interface for C# Tensors has been changed to use *Buffer objects

## [2021-02-09 - Version 1.2.2](https://github.com/matajoh/libnpy/releases/tag/v1.2.2)

Improvements:
32 changes: 18 additions & 14 deletions CSharpWrapper/NumpyIONative.i
@@ -8,6 +8,7 @@

%include "std_vector.i"
%include "std_string.i"
%include "std_wstring.i"
%include "stdint.i"
%include "arrays_csharp.i"
%include "typemaps.i"
@@ -25,7 +26,8 @@ enum class data_type_t : std::uint8_t {
INT64,
UINT64,
FLOAT32,
FLOAT64
FLOAT64,
UNICODE_STRING
};

%rename(Endian) endian_t;
@@ -43,6 +45,13 @@ enum class compression_method_t : std::uint16_t {
DEFLATED = 8
};

%typemap(ctype, out="void *") const wstring * "wchar_t *"
%typemap(imtype,
inattributes="[global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]",
outattributes="[return: global::System.Runtime.InteropServices.MarshalAs(UnmanagedType.LPArray, ArraySubType=UnmanagedType.LPStr)]"
) const wstring * "string[]"
%typemap(cstype) const wstring * "string[]"

%template(UInt8Buffer) std::vector<unsigned char>;
%template(Int8Buffer) std::vector<signed char>;
%template(UInt16Buffer) std::vector<unsigned short>;
@@ -53,6 +62,8 @@ enum class compression_method_t : std::uint16_t {
%template(Int64Buffer) std::vector<long long>;
%template(Float32Buffer) std::vector<float>;
%template(Float64Buffer) std::vector<double>;
%apply const std::wstring & {std::wstring &};
%template(UnicodeStringBuffer) std::vector<std::wstring>;

%template(Shape) std::vector<size_t>;

@@ -96,17 +107,6 @@ header_info peek(const std::string& path);
template <typename T>
class tensor {
public:
%apply unsigned char FIXED[] {const unsigned char *source};
%apply signed char FIXED[] {const signed char *source};
%apply unsigned short FIXED[] {const unsigned short *source};
%apply short FIXED[] {const short *source};
%apply unsigned int FIXED[] {const unsigned int *source};
%apply int FIXED[] {const int *source};
%apply unsigned long long FIXED[] {const unsigned long long *source};
%apply long long FIXED[] {const long long *source};
%apply float FIXED[] {const float *source};
%apply double FIXED[] {const double *source};

%exception tensor(const std::string& path) %{
try{
$action
@@ -139,7 +139,7 @@
%rename(Save) save;
void save(const std::string& path, endian_t endian = endian_t::NATIVE);

%exception copy_from(const T* source, size_t nitems) %{
%exception copy_from(const std::vector<T>& source) %{
try{
$action
} catch (std::invalid_argument& e){
@@ -150,7 +150,7 @@

%csmethodmodifiers copy_from "public unsafe override";
%rename(CopyFrom) copy_from;
void copy_from(const T* source, size_t itemCount);
void copy_from(const std::vector<T>& source);

%csmethodmodifiers values "protected override"
%rename(getValues) values;
@@ -223,6 +223,8 @@ public:
%template(Float32Tensor) tensor<float>;
%typemap(csbase) SWIGTYPE "Tensor<double, Float64Buffer>";
%template(Float64Tensor) tensor<double>;
%typemap(csbase) SWIGTYPE "Tensor<string, UnicodeStringBuffer>";
%template(UnicodeStringTensor) tensor<std::wstring>;

%typemap(csbase) SWIGTYPE ""

@@ -261,6 +263,7 @@ public:
%template(Write) write<long long>;
%template(Write) write<float>;
%template(Write) write<double>;
%template(Write) write<std::wstring>;
};

%rename(NPZInputStream) inpzstream;
@@ -329,4 +332,5 @@ public:
%template(ReadInt64) read<long long>;
%template(ReadFloat32) read<float>;
%template(ReadFloat64) read<double>;
%template(ReadUnicodeString) read<std::wstring>;
};
7 changes: 3 additions & 4 deletions CSharpWrapper/Tensor.cs
@@ -22,12 +22,11 @@ namespace NumpyIO
public abstract class Tensor<T, B> where B : IList<T>
{
/// <summary>
/// Copy the data from the provided array. These values will
/// Copy the data from the provided buffer. These values will
/// be copied into the underlying C++ type.
/// </summary>
/// <param name="source">The source array</param>
/// <param name="nitems">The number of items to copy</param>
public abstract void CopyFrom(T[] source, uint nitems);
/// <param name="source">The source buffer</param>
public abstract void CopyFrom(B source);

/// <summary>
/// Save the tensor to the provided location on the disk.
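The breaking change above replaces CopyFrom(T[] source, uint nitems) with CopyFrom(B source), where B is one of the SWIG-generated *Buffer classes (UInt8Buffer, ..., UnicodeStringBuffer) that wrap a std::vector. A minimal C++-level sketch of the call the wrapper now binds to, assuming npy::tensor exposes the copy_from(const std::vector<T>&) overload declared in the SWIG interface above and the (shape, fortran_order) constructor used by load() in include/npy/npy.h; the include path is also an assumption:

#include <string>
#include <vector>

#include <npy/tensor.h>  // assumed header for npy::tensor

int main()
{
    // Shape {2, 2}: four elements supplied as one flat vector.
    npy::tensor<std::wstring> names({2, 2}, false);  // assumed (shape, fortran_order) ctor
    names.copy_from(std::vector<std::wstring>{L"a", L"bb", L"ccc", L"dddd"});
    return 0;
}

On the C# side the equivalent call passes a UnicodeStringBuffer (or Float64Buffer, etc.) instead of a raw array and item count.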
7 changes: 5 additions & 2 deletions RELEASE_NOTES
@@ -1,2 +1,5 @@
Improvements:
- Bug fix for a missing comma on 1d shape
New Features:
- Support for Unicode string tensors (npy type 'U')

Breaking change:
- `CopyFrom` interface for C# Tensors has been changed to use *Buffer objects
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.2.2
1.3.0
Binary file modified assets/test/test.npz
Binary file modified assets/test/test_compressed.npz
Binary file added assets/test/unicode.npy
4 changes: 3 additions & 1 deletion include/npy/core.h
@@ -66,7 +66,9 @@ enum class data_type_t : std::uint8_t
/** 32-bit floating point value (float) */
FLOAT32,
/** 64-bit floating point value (double) */
FLOAT64
FLOAT64,
/** Unicode string (std::wstring) */
UNICODE_STRING
};

/** Convert a data type and endianness to a NPY dtype string.
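The new UNICODE_STRING value corresponds to numpy's 'U' dtype kind. As an illustrative sketch only (not the library's to_dtype implementation), the dtype string for a Unicode tensor combines the byte-order character with the maximum element length in code points, each code point being written as a 4-byte value by the save overload added to include/npy/npy.h below:

#include <cstddef>
#include <string>

// Illustrative helper: "<U7" means little-endian Unicode, 7 code points per
// element, i.e. 7 * 4 = 28 bytes per element in the .npy payload.
std::string unicode_dtype(bool little_endian, std::size_t max_length)
{
    return (little_endian ? "<U" : ">U") + std::to_string(max_length);
}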
152 changes: 130 additions & 22 deletions include/npy/npy.h
@@ -56,6 +56,9 @@ struct header_info

/** A vector of values indicating the shape of each dimension of the tensor. */
std::vector<size_t> shape;

/** Value used to indicate the maximum length of an element (used by Unicode strings) */
std::size_t max_element_length;
};

/** Writes an NPY header to the provided stream.
@@ -110,6 +113,25 @@ void write_npy_header(std::basic_ostream<CHAR> &output,
output.write(reinterpret_cast<const CHAR *>(end.data()), end.length());
}

template<typename T, typename CHAR>
void copy_to(const T* data_ptr, std::size_t num_elements, std::basic_ostream<CHAR>& output, npy::endian_t endianness)
{
if (endianness == npy::endian_t::NATIVE || endianness == native_endian())
{
output.write(reinterpret_cast<const CHAR *>(data_ptr), num_elements * sizeof(T));
}
else
{
CHAR buffer[sizeof(T)];
for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr)
{
const CHAR *start = reinterpret_cast<const CHAR *>(curr);
std::reverse_copy(start, start + sizeof(T), buffer);
output.write(buffer, sizeof(T));
}
}
}

/** Saves a tensor to the provided stream.
* \tparam T the data type
* \tparam TENSOR the tensor type.
@@ -120,32 +142,72 @@ void write_npy_header(std::basic_ostream<CHAR> &output,
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR>
typename CHAR,
std::enable_if_t<!std::is_same<std::wstring, T>::value, int> = 42>
void save(std::basic_ostream<CHAR> &output,
const TENSOR<T> &tensor,
endian_t endianness = npy::endian_t::NATIVE)
{
auto dtype = to_dtype(tensor.dtype(), endianness);
write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape());
copy_to(tensor.data(), tensor.size(), output, endianness);
};

if (endianness == npy::endian_t::NATIVE ||
endianness == native_endian() ||
dtype[0] == '|')
/** Saves a unicode string tensor to the provided stream.
* \tparam TENSOR the tensor type.
* \param output the output stream
* \param tensor the tensor
* \param endianness the endianness to use in saving the tensor
* \sa npy::tensor
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR,
std::enable_if_t<std::is_same<std::wstring, T>::value, int> = 42>
void save(std::basic_ostream<CHAR> &output,
const TENSOR<std::wstring> &tensor,
endian_t endianness = npy::endian_t::NATIVE)
{
std::size_t max_length = 0;
for(const auto& element : tensor)
{
output.write(reinterpret_cast<const CHAR *>(tensor.data()), tensor.size() * sizeof(T));
if(element.size() > max_length)
{
max_length = element.size();
}
}
else

if(endianness == npy::endian_t::NATIVE)
{
CHAR buffer[sizeof(T)];
for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr)
endianness = native_endian();
}

std::string dtype = ">U" + std::to_string(max_length);
if(endianness == npy::endian_t::LITTLE)
{
dtype = "<U" + std::to_string(max_length);
}

write_npy_header(output, dtype, tensor.fortran_order(), tensor.shape());

std::vector<std::int32_t> unicode(tensor.size() * max_length, 0);
auto word_start = unicode.begin();
for(const auto& element : tensor)
{
auto char_it = word_start;
for(const auto& wchar : element)
{
const CHAR *start = reinterpret_cast<const CHAR *>(curr);
std::reverse_copy(start, start + sizeof(T), buffer);
output.write(buffer, sizeof(T));
*char_it = static_cast<std::int32_t>(wchar);
char_it += 1;
}

word_start += max_length;
}

copy_to(unicode.data(), unicode.size(), output, endianness);
};


/** Saves a tensor to the provided location on disk.
* \tparam T the data type
* \tparam TENSOR the tensor type.
Expand All @@ -166,7 +228,7 @@ void save(const std::string &path,
throw std::invalid_argument("path");
}

save(output, tensor, endianness);
save<T, TENSOR, char>(output, tensor, endianness);
};

/** Read an NPY header from the provided stream.
@@ -202,6 +264,26 @@ header_info read_npy_header(std::basic_istream<CHAR> &input)
return header_info(dictionary);
}

template <typename T, typename CHAR>
void copy_to(std::basic_istream<CHAR> &input, T* data_ptr, std::size_t num_elements, npy::endian_t endianness)
{
if (endianness == npy::endian_t::NATIVE || endianness == native_endian())
{
CHAR *start = reinterpret_cast<CHAR *>(data_ptr);
input.read(start, num_elements * sizeof(T));
}
else
{
CHAR buffer[sizeof(T)];
for (auto curr = data_ptr; curr < data_ptr + num_elements; ++curr)
{
input.read(buffer, sizeof(T));
CHAR *start = reinterpret_cast<CHAR *>(curr);
std::reverse_copy(buffer, buffer + sizeof(T), start);
}
}
}

/** Loads a tensor in NPY format from the provided stream. The type of the tensor
* must match the data to be read.
* \tparam T the data type
@@ -212,7 +294,8 @@ header_info read_npy_header(std::basic_istream<CHAR> &input)
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR>
typename CHAR,
std::enable_if_t<!std::is_same<std::wstring, T>::value, int> = 42>
TENSOR<T> load(std::basic_istream<CHAR> &input)
{
header_info info = read_npy_header(input);
@@ -222,20 +305,45 @@ TENSOR<T> load(std::basic_istream<CHAR> &input)
throw std::logic_error("requested dtype does not match stream's dtype");
}

if (info.endianness == npy::endian_t::NATIVE || info.endianness == native_endian())
copy_to(input, tensor.data(), tensor.size(), info.endianness);
return tensor;
}


/** Loads a unicode string tensor in NPY format from the provided stream. The type of the tensor
* must match the data to be read.
* \tparam T the data type
* \tparam TENSOR the tensor type
* \param input the input stream
* \return an object of type TENSOR<T> read from the stream
* \sa npy::tensor
*/
template <typename T,
template <typename> class TENSOR,
typename CHAR,
std::enable_if_t<std::is_same<std::wstring, T>::value, int> = 42>
TENSOR<T> load(std::basic_istream<CHAR> &input)
{
header_info info = read_npy_header(input);
TENSOR<T> tensor(info.shape, info.fortran_order);
if (info.dtype != tensor.dtype())
{
CHAR *start = reinterpret_cast<CHAR *>(tensor.data());
input.read(start, tensor.size() * sizeof(T));
throw std::logic_error("requested dtype does not match stream's dtype");
}
else

std::vector<std::int32_t> unicode(tensor.size() * info.max_element_length, 0);
copy_to(input, unicode.data(), unicode.size(), info.endianness);

auto word_start = unicode.begin();
for(auto& element : tensor)
{
CHAR buffer[sizeof(T)];
for (auto curr = tensor.data(); curr < tensor.data() + tensor.size(); ++curr)
auto char_it = word_start;
for(std::size_t i=0; i<info.max_element_length && *char_it > 0; ++i, ++char_it)
{
input.read(buffer, sizeof(T));
CHAR *start = reinterpret_cast<CHAR *>(curr);
std::reverse_copy(buffer, buffer + sizeof(T), start);
element.push_back(static_cast<wchar_t>(*char_it));
}

word_start += info.max_element_length;
}

return tensor;
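A minimal end-to-end sketch of the new Unicode path. It assumes the npy::tensor class referenced by these templates provides the (shape, fortran_order) constructor used by load() above and the copy_from(const std::vector<T>&) overload declared in the SWIG interface; the include paths are assumptions, while the path-based save and the stream-based load are the overloads shown in this diff:

#include <fstream>
#include <string>
#include <vector>

#include <npy/npy.h>     // save/load shown in this file
#include <npy/tensor.h>  // assumed header for npy::tensor

int main()
{
    // Two elements; "alpha" (5 code points) sets the width, so the dtype is
    // "<U5" on a little-endian machine and "bye" is zero-padded to 5.
    npy::tensor<std::wstring> names({2}, false);  // assumed (shape, fortran_order) ctor
    names.copy_from(std::vector<std::wstring>{L"alpha", L"bye"});

    npy::save("names.npy", names);  // path-based overload from this header

    std::ifstream file("names.npy", std::ios::binary);
    auto copy = npy::load<std::wstring, npy::tensor>(file);  // Unicode load overload
    return copy.size() == names.size() ? 0 : 1;
}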
2 changes: 1 addition & 1 deletion include/npy/npz.h
@@ -95,7 +95,7 @@ class onpzstream
}

omemstream output;
save(output, tensor);
save<T, TENSOR, omemstream::char_type>(output, tensor);

std::string suffix = ".npy";
std::string name = filename;
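In the NPZ writer above, save is now called with explicit template arguments; at the user level nothing changes. A hedged usage sketch follows; the onpzstream constructor taking the archive path and the write(filename, tensor) signature are assumptions inferred from the snippet above and the SWIG wrapper, not shown in this diff:

#include <string>
#include <vector>

#include <npy/npz.h>     // onpzstream from this file
#include <npy/tensor.h>  // assumed header for npy::tensor

int main()
{
    npy::tensor<std::wstring> names({2}, false);  // assumed (shape, fortran_order) ctor
    names.copy_from(std::vector<std::wstring>{L"alpha", L"bye"});

    npy::onpzstream archive("strings.npz");  // assumed path-taking constructor
    archive.write("names.npy", names);       // assumed (filename, tensor) signature
    return 0;
}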