• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2    Copyright 2017 Leon Merten Lohse
3 
4    Permission is hereby granted, free of charge, to any person obtaining a copy
5    of this software and associated documentation files (the "Software"), to deal
6    in the Software without restriction, including without limitation the rights
7    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8    copies of the Software, and to permit persons to whom the Software is
9    furnished to do so, subject to the following conditions:
10 
11    The above copyright notice and this permission notice shall be included in
12    all copies or substantial portions of the Software.
13 
14    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20    SOFTWARE.
21 */
22 
23 #ifndef NPY_H
24 #define NPY_H
25 
26 #include <complex>
27 #include <fstream>
28 #include <string>
29 #include <iostream>
30 #include <sstream>
31 #include <cstdint>
32 #include <cstring>
33 #include <vector>
34 #include <stdexcept>
35 #include <algorithm>
36 #include <regex>
37 #include <unordered_map>
38 
39 
40 namespace npy {
41 
42 /* Compile-time test for byte order.
43    If your compiler does not define these per default, you may want to define
44    one of these constants manually.
45    Defaults to little endian order. */
46 #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \
47     defined(__BIG_ENDIAN__) || \
48     defined(__ARMEB__) || \
49     defined(__THUMBEB__) || \
50     defined(__AARCH64EB__) || \
51     defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__)
52 const bool big_endian = true;
53 #else
54 const bool big_endian = false;
55 #endif
56 
57 
58 const char magic_string[] = "\x93NUMPY";
59 const size_t magic_string_length = 6;
60 
61 const char little_endian_char = '<';
62 const char big_endian_char = '>';
63 const char no_endian_char = '|';
64 
65 constexpr char host_endian_char = ( big_endian ?
66     big_endian_char :
67     little_endian_char );
68 
69 /* npy array length */
70 typedef unsigned long int ndarray_len_t;
71 
write_magic(std::ostream & ostream,unsigned char v_major=1,unsigned char v_minor=0)72 inline void write_magic(std::ostream& ostream, unsigned char v_major=1, unsigned char v_minor=0) {
73   ostream.write(magic_string, magic_string_length);
74   ostream.put(v_major);
75   ostream.put(v_minor);
76 }
77 
read_magic(std::istream & istream,unsigned char & v_major,unsigned char & v_minor)78 inline void read_magic(std::istream& istream, unsigned char& v_major, unsigned char& v_minor) {
79   char buf[magic_string_length+2];
80   istream.read(buf, magic_string_length+2);
81 
82   if(!istream) {
83     throw std::runtime_error("io error: failed reading file");
84   }
85 
86   if (0 != std::memcmp(buf, magic_string, magic_string_length))
87     throw std::runtime_error("this file does not have a valid npy format.");
88 
89   v_major = buf[magic_string_length];
90   v_minor = buf[magic_string_length+1];
91 }
92 
93 // typestring magic
94 struct Typestring {
95   private:
96     char c_endian;
97     char c_type;
98     int  len;
99 
100   public:
strnpy::Typestring101     inline std::string str() {
102       const size_t max_buflen = 16;
103       char buf[max_buflen];
104       std::sprintf(buf, "%c%c%u", c_endian, c_type, len);
105       return std::string(buf);
106     }
107 
Typestringnpy::Typestring108     Typestring(const std::vector<float>& v)
109       :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(float)} {}
Typestringnpy::Typestring110     Typestring(const std::vector<double>& v)
111       :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(double)} {}
Typestringnpy::Typestring112     Typestring(const std::vector<long double>& v)
113       :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(long double)} {}
114 
Typestringnpy::Typestring115     Typestring(const std::vector<char>& v)
116       :c_endian {no_endian_char}, c_type {'i'}, len {sizeof(char)} {}
Typestringnpy::Typestring117     Typestring(const std::vector<short>& v)
118       :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(short)} {}
Typestringnpy::Typestring119     Typestring(const std::vector<int>& v)
120       :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(int)} {}
Typestringnpy::Typestring121     Typestring(const std::vector<long>& v)
122       :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(long)} {}
Typestringnpy::Typestring123     Typestring(const std::vector<long long>& v) :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(long long)} {}
124 
Typestringnpy::Typestring125     Typestring(const std::vector<unsigned char>& v)
126       :c_endian {no_endian_char}, c_type {'u'}, len {sizeof(unsigned char)} {}
Typestringnpy::Typestring127     Typestring(const std::vector<unsigned short>& v)
128       :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned short)} {}
Typestringnpy::Typestring129     Typestring(const std::vector<unsigned int>& v)
130       :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned int)} {}
Typestringnpy::Typestring131     Typestring(const std::vector<unsigned long>& v)
132       :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned long)} {}
Typestringnpy::Typestring133     Typestring(const std::vector<unsigned long long>& v)
134       :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned long long)} {}
135 
Typestringnpy::Typestring136     Typestring(const std::vector<std::complex<float>>& v)
137       :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<float>)} {}
Typestringnpy::Typestring138     Typestring(const std::vector<std::complex<double>>& v)
139       :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<double>)} {}
Typestringnpy::Typestring140     Typestring(const std::vector<std::complex<long double>>& v)
141       :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<long double>)} {}
142 };
143 
parse_typestring(std::string typestring)144 inline void parse_typestring( std::string typestring){
145   std::regex re ("'([<>|])([ifuc])(\\d+)'");
146   std::smatch sm;
147 
148   std::regex_match(typestring, sm, re );
149 
150   if ( sm.size() != 4 ) {
151     throw std::runtime_error("invalid typestring");
152   }
153 }
154 
155 namespace pyparse {
156 
157 /**
158   Removes leading and trailing whitespaces
159   */
trim(const std::string & str)160 inline std::string trim(const std::string& str) {
161   const std::string whitespace = " \t";
162   auto begin = str.find_first_not_of(whitespace);
163 
164   if (begin == std::string::npos)
165     return "";
166 
167   auto end = str.find_last_not_of(whitespace);
168 
169   return str.substr(begin, end-begin+1);
170 }
171 
172 
get_value_from_map(const std::string & mapstr)173 inline std::string get_value_from_map(const std::string& mapstr) {
174   size_t sep_pos = mapstr.find_first_of(":");
175   if (sep_pos == std::string::npos)
176     return "";
177 
178   std::string tmp = mapstr.substr(sep_pos+1);
179   return trim(tmp);
180 }
181 
182 /**
183    Parses the string representation of a Python dict
184 
185    The keys need to be known and may not appear anywhere else in the data.
186  */
parse_dict(std::string in,std::vector<std::string> & keys)187 inline std::unordered_map<std::string, std::string> parse_dict(std::string in, std::vector<std::string>& keys) {
188 
189   std::unordered_map<std::string, std::string> map;
190 
191   if (keys.size() == 0)
192     return map;
193 
194   in = trim(in);
195 
196   // unwrap dictionary
197   if ((in.front() == '{') && (in.back() == '}'))
198     in = in.substr(1, in.length()-2);
199   else
200     throw std::runtime_error("Not a Python dictionary.");
201 
202   std::vector<std::pair<size_t, std::string>> positions;
203 
204   for (auto const& value : keys) {
205     size_t pos = in.find( "'" + value + "'" );
206 
207     if (pos == std::string::npos)
208       throw std::runtime_error("Missing '"+value+"' key.");
209 
210     std::pair<size_t, std::string> position_pair { pos, value };
211     positions.push_back(position_pair);
212   }
213 
214   // sort by position in dict
215   std::sort(positions.begin(), positions.end() );
216 
217   for(size_t i = 0; i < positions.size(); ++i) {
218     std::string raw_value;
219     size_t begin { positions[i].first };
220     size_t end { std::string::npos };
221 
222     std::string key = positions[i].second;
223 
224     if ( i+1 < positions.size() )
225       end = positions[i+1].first;
226 
227     raw_value = in.substr(begin, end-begin);
228 
229     raw_value = trim(raw_value);
230 
231     if (raw_value.back() == ',')
232       raw_value.pop_back();
233 
234     map[key] = get_value_from_map(raw_value);
235   }
236 
237   return map;
238 }
239 
240 /**
241   Parses the string representation of a Python boolean
242   */
parse_bool(const std::string & in)243 inline bool parse_bool(const std::string& in) {
244   if (in == "True")
245     return true;
246   if (in == "False")
247     return false;
248 
249   throw std::runtime_error("Invalid python boolan.");
250 }
251 
252 /**
253   Parses the string representation of a Python str
254   */
parse_str(const std::string & in)255 inline std::string parse_str(const std::string& in) {
256   if ((in.front() == '\'') && (in.back() == '\''))
257     return in.substr(1, in.length()-2);
258 
259   throw std::runtime_error("Invalid python string.");
260 }
261 
262 /**
263   Parses the string represenatation of a Python tuple into a vector of its items
264  */
parse_tuple(std::string in)265 inline std::vector<std::string> parse_tuple(std::string in) {
266   std::vector<std::string> v;
267   const char seperator = ',';
268 
269   in = trim(in);
270 
271   if ((in.front() == '(') && (in.back() == ')'))
272     in = in.substr(1, in.length()-2);
273   else
274     throw std::runtime_error("Invalid Python tuple.");
275 
276   std::istringstream iss(in);
277 
278   for (std::string token; std::getline(iss, token, seperator);) {
279       v.push_back(token);
280   }
281 
282   return v;
283 }
284 
285 template <typename T>
write_tuple(const std::vector<T> & v)286 inline std::string write_tuple(const std::vector<T>& v) {
287   if (v.size() == 0)
288     return "";
289 
290   std::ostringstream ss;
291 
292   if (v.size() == 1) {
293     ss << "(" << v.front() << ",)";
294   } else {
295     const std::string delimiter = ", ";
296     // v.size() > 1
297     ss << "(";
298     std::copy(v.begin(), v.end()-1, std::ostream_iterator<T>(ss, delimiter.c_str()));
299     ss << v.back();
300     ss << ")";
301   }
302 
303   return ss.str();
304 }
305 
write_boolean(bool b)306 inline std::string write_boolean(bool b) {
307   if(b)
308     return "True";
309   else
310     return "False";
311 }
312 
313 } // namespace pyparse
314 
315 
parse_header(std::string header,std::string & descr,bool & fortran_order,std::vector<ndarray_len_t> & shape)316 inline void parse_header(std::string header, std::string& descr, bool& fortran_order, std::vector<ndarray_len_t>& shape) {
317   /*
318      The first 6 bytes are a magic string: exactly "x93NUMPY".
319      The next 1 byte is an unsigned byte: the major version number of the file format, e.g. x01.
320      The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. x00. Note: the version of the file format is not tied to the version of the numpy package.
321      The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN.
322      The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline ('n') and padded with spaces ('x20') to make the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 for alignment purposes.
323      The dictionary contains three keys:
324 
325      "descr" : dtype.descr
326      An object that can be passed as an argument to the numpy.dtype() constructor to create the array's dtype.
327      "fortran_order" : bool
328      Whether the array data is Fortran-contiguous or not. Since Fortran-contiguous arrays are a common form of non-C-contiguity, we allow them to be written directly to disk for efficiency.
329      "shape" : tuple of int
330      The shape of the array.
331      For repeatability and readability, this dictionary is formatted using pprint.pformat() so the keys are in alphabetic order.
332    */
333 
334   // remove trailing newline
335   if (header.back() != '\n')
336     throw std::runtime_error("invalid header");
337   header.pop_back();
338 
339   // parse the dictionary
340   std::vector<std::string> keys { "descr", "fortran_order", "shape" };
341   auto dict_map = npy::pyparse::parse_dict(header, keys);
342 
343   if (dict_map.size() == 0)
344     throw std::runtime_error("invalid dictionary in header");
345 
346   std::string descr_s = dict_map["descr"];
347   std::string fortran_s = dict_map["fortran_order"];
348   std::string shape_s = dict_map["shape"];
349 
350   // TODO: extract info from typestring
351   parse_typestring(descr_s);
352   // remove
353   descr = npy::pyparse::parse_str(descr_s);
354 
355   // convert literal Python bool to C++ bool
356   fortran_order = npy::pyparse::parse_bool(fortran_s);
357 
358   // parse the shape tuple
359   auto shape_v = npy::pyparse::parse_tuple(shape_s);
360   if (shape_v.size() == 0)
361     throw std::runtime_error("invalid shape tuple in header");
362 
363   for ( auto item : shape_v ) {
364     std::stringstream stream(item);
365     unsigned long value;
366     stream >> value;
367     ndarray_len_t dim = static_cast<ndarray_len_t>(value);
368     shape.push_back(dim);
369   }
370 }
371 
372 
write_header_dict(const std::string & descr,bool fortran_order,const std::vector<ndarray_len_t> & shape)373 inline std::string write_header_dict(const std::string& descr, bool fortran_order, const std::vector<ndarray_len_t>& shape) {
374     std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order);
375     std::string shape_s = npy::pyparse::write_tuple(shape);
376 
377     return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + ", 'shape': " + shape_s + ", }";
378 }
379 
write_header(std::ostream & out,const std::string & descr,bool fortran_order,const std::vector<ndarray_len_t> & shape_v)380 inline void write_header(std::ostream& out, const std::string& descr, bool fortran_order, const std::vector<ndarray_len_t>& shape_v)
381 {
382     std::string header_dict = write_header_dict(descr, fortran_order, shape_v);
383 
384     size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1;
385 
386     unsigned char version[2] = {1, 0};
387     if (length >= 255*255) {
388       length = magic_string_length + 2 + 4 + header_dict.length() + 1;
389       version[0] = 2;
390       version[1] = 0;
391     }
392     size_t padding_len = 16 - length % 16;
393     std::string padding (padding_len, ' ');
394 
395     // write magic
396     write_magic(out, version[0], version[1]);
397 
398     // write header length
399     if (version[0] == 1 && version[1] == 0) {
400       char header_len_le16[2];
401       uint16_t header_len = header_dict.length() + padding.length() + 1;
402 
403       header_len_le16[0] = (header_len >> 0) & 0xff;
404       header_len_le16[1] = (header_len >> 8) & 0xff;
405       out.write(reinterpret_cast<char *>(header_len_le16), 2);
406     }else{
407       char header_len_le32[4];
408       uint32_t header_len = header_dict.length() + padding.length() + 1;
409 
410       header_len_le32[0] = (header_len >> 0) & 0xff;
411       header_len_le32[1] = (header_len >> 8) & 0xff;
412       header_len_le32[2] = (header_len >> 16) & 0xff;
413       header_len_le32[3] = (header_len >> 24) & 0xff;
414       out.write(reinterpret_cast<char *>(header_len_le32), 4);
415     }
416 
417     out << header_dict << padding << '\n';
418 }
419 
read_header(std::istream & istream)420 inline std::string read_header(std::istream& istream) {
421     // check magic bytes an version number
422     unsigned char v_major, v_minor;
423     read_magic(istream, v_major, v_minor);
424 
425     uint32_t header_length;
426     if(v_major == 1 && v_minor == 0){
427 
428       char header_len_le16[2];
429       istream.read(header_len_le16, 2);
430       header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8);
431 
432       if((magic_string_length + 2 + 2 + header_length) % 16 != 0) {
433           // TODO: display warning
434       }
435     }else if(v_major == 2 && v_minor == 0) {
436       char header_len_le32[4];
437       istream.read(header_len_le32, 4);
438 
439       header_length = (header_len_le32[0] <<  0) | (header_len_le32[1] <<  8)
440                     | (header_len_le32[2] << 16) | (header_len_le32[3] <<  24);
441 
442       if((magic_string_length + 2 + 4 + header_length) % 16 != 0) {
443         // TODO: display warning
444       }
445     }else{
446        throw std::runtime_error("unsupported file format version");
447     }
448 
449     auto buf_v = std::vector<char>();
450     buf_v.reserve(header_length);
451     istream.read(buf_v.data(), header_length);
452     std::string header(buf_v.data(), header_length);
453 
454     return header;
455 }
456 
comp_size(const std::vector<ndarray_len_t> & shape)457 inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) {
458     ndarray_len_t size = 1;
459     for (ndarray_len_t i : shape )
460       size *= i;
461 
462     return size;
463 }
464 
465 template<typename Scalar>
SaveArrayAsNumpy(const std::string & filename,bool fortran_order,unsigned int n_dims,const unsigned long shape[],const std::vector<Scalar> & data)466 inline void SaveArrayAsNumpy( const std::string& filename, bool fortran_order, unsigned int n_dims, const unsigned long shape[], const std::vector<Scalar>& data)
467 {
468     Typestring typestring_o(data);
469     std::string typestring = typestring_o.str();
470 
471     std::ofstream stream( filename, std::ofstream::binary);
472     if(!stream) {
473         throw std::runtime_error("io error: failed to open a file.");
474     }
475 
476     std::vector<ndarray_len_t> shape_v(shape, shape+n_dims);
477     write_header(stream, typestring, fortran_order, shape_v);
478 
479     auto size = static_cast<size_t>(comp_size(shape_v));
480 
481     stream.write(reinterpret_cast<const char*>(data.data()), sizeof(Scalar) * size);
482 }
483 
484 
485 template<typename Scalar>
LoadArrayFromNumpy(const std::string & filename,std::vector<unsigned long> & shape,std::vector<Scalar> & data)486 inline void LoadArrayFromNumpy(const std::string& filename, std::vector<unsigned long>& shape, std::vector<Scalar>& data)
487 {
488     std::ifstream stream(filename, std::ifstream::binary);
489     if(!stream) {
490         throw std::runtime_error("io error: failed to open a file.");
491     }
492 
493     std::string header = read_header(stream);
494 
495     // parse header
496     bool fortran_order;
497     std::string typestr;
498 
499     parse_header(header, typestr, fortran_order, shape);
500 
501     // check if the typestring matches the given one
502     Typestring typestring_o {data};
503     std::string expect_typestr = typestring_o.str();
504     if (typestr != expect_typestr) {
505       throw std::runtime_error("formatting error: typestrings not matching");
506     }
507 
508 
509     // compute the data size based on the shape
510     auto size = static_cast<size_t>(comp_size(shape));
511     data.resize(size);
512 
513     // read the data
514     stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar)*size);
515 }
516 
517 } // namespace npy
518 
519 #endif // NPY_H
520