1 /*
2 Copyright 2017 Leon Merten Lohse
3
4 Permission is hereby granted, free of charge, to any person obtaining a copy
5 of this software and associated documentation files (the "Software"), to deal
6 in the Software without restriction, including without limitation the rights
7 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 copies of the Software, and to permit persons to whom the Software is
9 furnished to do so, subject to the following conditions:
10
11 The above copyright notice and this permission notice shall be included in
12 all copies or substantial portions of the Software.
13
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 SOFTWARE.
21 */
22
23 #ifndef NPY_H
24 #define NPY_H
25
26 #include <complex>
27 #include <fstream>
28 #include <string>
29 #include <iostream>
30 #include <sstream>
31 #include <cstdint>
32 #include <cstring>
33 #include <vector>
34 #include <stdexcept>
35 #include <algorithm>
36 #include <regex>
37 #include <unordered_map>
38
39
40 namespace npy {
41
42 /* Compile-time test for byte order.
43 If your compiler does not define these per default, you may want to define
44 one of these constants manually.
45 Defaults to little endian order. */
46 #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \
47 defined(__BIG_ENDIAN__) || \
48 defined(__ARMEB__) || \
49 defined(__THUMBEB__) || \
50 defined(__AARCH64EB__) || \
51 defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__)
52 const bool big_endian = true;
53 #else
54 const bool big_endian = false;
55 #endif
56
57
58 const char magic_string[] = "\x93NUMPY";
59 const size_t magic_string_length = 6;
60
61 const char little_endian_char = '<';
62 const char big_endian_char = '>';
63 const char no_endian_char = '|';
64
65 constexpr char host_endian_char = ( big_endian ?
66 big_endian_char :
67 little_endian_char );
68
69 /* npy array length */
70 typedef unsigned long int ndarray_len_t;
71
write_magic(std::ostream & ostream,unsigned char v_major=1,unsigned char v_minor=0)72 inline void write_magic(std::ostream& ostream, unsigned char v_major=1, unsigned char v_minor=0) {
73 ostream.write(magic_string, magic_string_length);
74 ostream.put(v_major);
75 ostream.put(v_minor);
76 }
77
read_magic(std::istream & istream,unsigned char & v_major,unsigned char & v_minor)78 inline void read_magic(std::istream& istream, unsigned char& v_major, unsigned char& v_minor) {
79 char buf[magic_string_length+2];
80 istream.read(buf, magic_string_length+2);
81
82 if(!istream) {
83 throw std::runtime_error("io error: failed reading file");
84 }
85
86 if (0 != std::memcmp(buf, magic_string, magic_string_length))
87 throw std::runtime_error("this file does not have a valid npy format.");
88
89 v_major = buf[magic_string_length];
90 v_minor = buf[magic_string_length+1];
91 }
92
93 // typestring magic
94 struct Typestring {
95 private:
96 char c_endian;
97 char c_type;
98 int len;
99
100 public:
strnpy::Typestring101 inline std::string str() {
102 const size_t max_buflen = 16;
103 char buf[max_buflen];
104 std::sprintf(buf, "%c%c%u", c_endian, c_type, len);
105 return std::string(buf);
106 }
107
Typestringnpy::Typestring108 Typestring(const std::vector<float>& v)
109 :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(float)} {}
Typestringnpy::Typestring110 Typestring(const std::vector<double>& v)
111 :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(double)} {}
Typestringnpy::Typestring112 Typestring(const std::vector<long double>& v)
113 :c_endian {host_endian_char}, c_type {'f'}, len {sizeof(long double)} {}
114
Typestringnpy::Typestring115 Typestring(const std::vector<char>& v)
116 :c_endian {no_endian_char}, c_type {'i'}, len {sizeof(char)} {}
Typestringnpy::Typestring117 Typestring(const std::vector<short>& v)
118 :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(short)} {}
Typestringnpy::Typestring119 Typestring(const std::vector<int>& v)
120 :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(int)} {}
Typestringnpy::Typestring121 Typestring(const std::vector<long>& v)
122 :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(long)} {}
Typestringnpy::Typestring123 Typestring(const std::vector<long long>& v) :c_endian {host_endian_char}, c_type {'i'}, len {sizeof(long long)} {}
124
Typestringnpy::Typestring125 Typestring(const std::vector<unsigned char>& v)
126 :c_endian {no_endian_char}, c_type {'u'}, len {sizeof(unsigned char)} {}
Typestringnpy::Typestring127 Typestring(const std::vector<unsigned short>& v)
128 :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned short)} {}
Typestringnpy::Typestring129 Typestring(const std::vector<unsigned int>& v)
130 :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned int)} {}
Typestringnpy::Typestring131 Typestring(const std::vector<unsigned long>& v)
132 :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned long)} {}
Typestringnpy::Typestring133 Typestring(const std::vector<unsigned long long>& v)
134 :c_endian {host_endian_char}, c_type {'u'}, len {sizeof(unsigned long long)} {}
135
Typestringnpy::Typestring136 Typestring(const std::vector<std::complex<float>>& v)
137 :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<float>)} {}
Typestringnpy::Typestring138 Typestring(const std::vector<std::complex<double>>& v)
139 :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<double>)} {}
Typestringnpy::Typestring140 Typestring(const std::vector<std::complex<long double>>& v)
141 :c_endian {host_endian_char}, c_type {'c'}, len {sizeof(std::complex<long double>)} {}
142 };
143
parse_typestring(std::string typestring)144 inline void parse_typestring( std::string typestring){
145 std::regex re ("'([<>|])([ifuc])(\\d+)'");
146 std::smatch sm;
147
148 std::regex_match(typestring, sm, re );
149
150 if ( sm.size() != 4 ) {
151 throw std::runtime_error("invalid typestring");
152 }
153 }
154
155 namespace pyparse {
156
157 /**
158 Removes leading and trailing whitespaces
159 */
trim(const std::string & str)160 inline std::string trim(const std::string& str) {
161 const std::string whitespace = " \t";
162 auto begin = str.find_first_not_of(whitespace);
163
164 if (begin == std::string::npos)
165 return "";
166
167 auto end = str.find_last_not_of(whitespace);
168
169 return str.substr(begin, end-begin+1);
170 }
171
172
get_value_from_map(const std::string & mapstr)173 inline std::string get_value_from_map(const std::string& mapstr) {
174 size_t sep_pos = mapstr.find_first_of(":");
175 if (sep_pos == std::string::npos)
176 return "";
177
178 std::string tmp = mapstr.substr(sep_pos+1);
179 return trim(tmp);
180 }
181
182 /**
183 Parses the string representation of a Python dict
184
185 The keys need to be known and may not appear anywhere else in the data.
186 */
parse_dict(std::string in,std::vector<std::string> & keys)187 inline std::unordered_map<std::string, std::string> parse_dict(std::string in, std::vector<std::string>& keys) {
188
189 std::unordered_map<std::string, std::string> map;
190
191 if (keys.size() == 0)
192 return map;
193
194 in = trim(in);
195
196 // unwrap dictionary
197 if ((in.front() == '{') && (in.back() == '}'))
198 in = in.substr(1, in.length()-2);
199 else
200 throw std::runtime_error("Not a Python dictionary.");
201
202 std::vector<std::pair<size_t, std::string>> positions;
203
204 for (auto const& value : keys) {
205 size_t pos = in.find( "'" + value + "'" );
206
207 if (pos == std::string::npos)
208 throw std::runtime_error("Missing '"+value+"' key.");
209
210 std::pair<size_t, std::string> position_pair { pos, value };
211 positions.push_back(position_pair);
212 }
213
214 // sort by position in dict
215 std::sort(positions.begin(), positions.end() );
216
217 for(size_t i = 0; i < positions.size(); ++i) {
218 std::string raw_value;
219 size_t begin { positions[i].first };
220 size_t end { std::string::npos };
221
222 std::string key = positions[i].second;
223
224 if ( i+1 < positions.size() )
225 end = positions[i+1].first;
226
227 raw_value = in.substr(begin, end-begin);
228
229 raw_value = trim(raw_value);
230
231 if (raw_value.back() == ',')
232 raw_value.pop_back();
233
234 map[key] = get_value_from_map(raw_value);
235 }
236
237 return map;
238 }
239
240 /**
241 Parses the string representation of a Python boolean
242 */
parse_bool(const std::string & in)243 inline bool parse_bool(const std::string& in) {
244 if (in == "True")
245 return true;
246 if (in == "False")
247 return false;
248
249 throw std::runtime_error("Invalid python boolan.");
250 }
251
252 /**
253 Parses the string representation of a Python str
254 */
parse_str(const std::string & in)255 inline std::string parse_str(const std::string& in) {
256 if ((in.front() == '\'') && (in.back() == '\''))
257 return in.substr(1, in.length()-2);
258
259 throw std::runtime_error("Invalid python string.");
260 }
261
262 /**
263 Parses the string represenatation of a Python tuple into a vector of its items
264 */
parse_tuple(std::string in)265 inline std::vector<std::string> parse_tuple(std::string in) {
266 std::vector<std::string> v;
267 const char seperator = ',';
268
269 in = trim(in);
270
271 if ((in.front() == '(') && (in.back() == ')'))
272 in = in.substr(1, in.length()-2);
273 else
274 throw std::runtime_error("Invalid Python tuple.");
275
276 std::istringstream iss(in);
277
278 for (std::string token; std::getline(iss, token, seperator);) {
279 v.push_back(token);
280 }
281
282 return v;
283 }
284
285 template <typename T>
write_tuple(const std::vector<T> & v)286 inline std::string write_tuple(const std::vector<T>& v) {
287 if (v.size() == 0)
288 return "";
289
290 std::ostringstream ss;
291
292 if (v.size() == 1) {
293 ss << "(" << v.front() << ",)";
294 } else {
295 const std::string delimiter = ", ";
296 // v.size() > 1
297 ss << "(";
298 std::copy(v.begin(), v.end()-1, std::ostream_iterator<T>(ss, delimiter.c_str()));
299 ss << v.back();
300 ss << ")";
301 }
302
303 return ss.str();
304 }
305
write_boolean(bool b)306 inline std::string write_boolean(bool b) {
307 if(b)
308 return "True";
309 else
310 return "False";
311 }
312
313 } // namespace pyparse
314
315
parse_header(std::string header,std::string & descr,bool & fortran_order,std::vector<ndarray_len_t> & shape)316 inline void parse_header(std::string header, std::string& descr, bool& fortran_order, std::vector<ndarray_len_t>& shape) {
317 /*
318 The first 6 bytes are a magic string: exactly "x93NUMPY".
319 The next 1 byte is an unsigned byte: the major version number of the file format, e.g. x01.
320 The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. x00. Note: the version of the file format is not tied to the version of the numpy package.
321 The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN.
322 The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. It is terminated by a newline ('n') and padded with spaces ('x20') to make the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 for alignment purposes.
323 The dictionary contains three keys:
324
325 "descr" : dtype.descr
326 An object that can be passed as an argument to the numpy.dtype() constructor to create the array's dtype.
327 "fortran_order" : bool
328 Whether the array data is Fortran-contiguous or not. Since Fortran-contiguous arrays are a common form of non-C-contiguity, we allow them to be written directly to disk for efficiency.
329 "shape" : tuple of int
330 The shape of the array.
331 For repeatability and readability, this dictionary is formatted using pprint.pformat() so the keys are in alphabetic order.
332 */
333
334 // remove trailing newline
335 if (header.back() != '\n')
336 throw std::runtime_error("invalid header");
337 header.pop_back();
338
339 // parse the dictionary
340 std::vector<std::string> keys { "descr", "fortran_order", "shape" };
341 auto dict_map = npy::pyparse::parse_dict(header, keys);
342
343 if (dict_map.size() == 0)
344 throw std::runtime_error("invalid dictionary in header");
345
346 std::string descr_s = dict_map["descr"];
347 std::string fortran_s = dict_map["fortran_order"];
348 std::string shape_s = dict_map["shape"];
349
350 // TODO: extract info from typestring
351 parse_typestring(descr_s);
352 // remove
353 descr = npy::pyparse::parse_str(descr_s);
354
355 // convert literal Python bool to C++ bool
356 fortran_order = npy::pyparse::parse_bool(fortran_s);
357
358 // parse the shape tuple
359 auto shape_v = npy::pyparse::parse_tuple(shape_s);
360 if (shape_v.size() == 0)
361 throw std::runtime_error("invalid shape tuple in header");
362
363 for ( auto item : shape_v ) {
364 std::stringstream stream(item);
365 unsigned long value;
366 stream >> value;
367 ndarray_len_t dim = static_cast<ndarray_len_t>(value);
368 shape.push_back(dim);
369 }
370 }
371
372
write_header_dict(const std::string & descr,bool fortran_order,const std::vector<ndarray_len_t> & shape)373 inline std::string write_header_dict(const std::string& descr, bool fortran_order, const std::vector<ndarray_len_t>& shape) {
374 std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order);
375 std::string shape_s = npy::pyparse::write_tuple(shape);
376
377 return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + ", 'shape': " + shape_s + ", }";
378 }
379
write_header(std::ostream & out,const std::string & descr,bool fortran_order,const std::vector<ndarray_len_t> & shape_v)380 inline void write_header(std::ostream& out, const std::string& descr, bool fortran_order, const std::vector<ndarray_len_t>& shape_v)
381 {
382 std::string header_dict = write_header_dict(descr, fortran_order, shape_v);
383
384 size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1;
385
386 unsigned char version[2] = {1, 0};
387 if (length >= 255*255) {
388 length = magic_string_length + 2 + 4 + header_dict.length() + 1;
389 version[0] = 2;
390 version[1] = 0;
391 }
392 size_t padding_len = 16 - length % 16;
393 std::string padding (padding_len, ' ');
394
395 // write magic
396 write_magic(out, version[0], version[1]);
397
398 // write header length
399 if (version[0] == 1 && version[1] == 0) {
400 char header_len_le16[2];
401 uint16_t header_len = header_dict.length() + padding.length() + 1;
402
403 header_len_le16[0] = (header_len >> 0) & 0xff;
404 header_len_le16[1] = (header_len >> 8) & 0xff;
405 out.write(reinterpret_cast<char *>(header_len_le16), 2);
406 }else{
407 char header_len_le32[4];
408 uint32_t header_len = header_dict.length() + padding.length() + 1;
409
410 header_len_le32[0] = (header_len >> 0) & 0xff;
411 header_len_le32[1] = (header_len >> 8) & 0xff;
412 header_len_le32[2] = (header_len >> 16) & 0xff;
413 header_len_le32[3] = (header_len >> 24) & 0xff;
414 out.write(reinterpret_cast<char *>(header_len_le32), 4);
415 }
416
417 out << header_dict << padding << '\n';
418 }
419
read_header(std::istream & istream)420 inline std::string read_header(std::istream& istream) {
421 // check magic bytes an version number
422 unsigned char v_major, v_minor;
423 read_magic(istream, v_major, v_minor);
424
425 uint32_t header_length;
426 if(v_major == 1 && v_minor == 0){
427
428 char header_len_le16[2];
429 istream.read(header_len_le16, 2);
430 header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8);
431
432 if((magic_string_length + 2 + 2 + header_length) % 16 != 0) {
433 // TODO: display warning
434 }
435 }else if(v_major == 2 && v_minor == 0) {
436 char header_len_le32[4];
437 istream.read(header_len_le32, 4);
438
439 header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8)
440 | (header_len_le32[2] << 16) | (header_len_le32[3] << 24);
441
442 if((magic_string_length + 2 + 4 + header_length) % 16 != 0) {
443 // TODO: display warning
444 }
445 }else{
446 throw std::runtime_error("unsupported file format version");
447 }
448
449 auto buf_v = std::vector<char>();
450 buf_v.reserve(header_length);
451 istream.read(buf_v.data(), header_length);
452 std::string header(buf_v.data(), header_length);
453
454 return header;
455 }
456
comp_size(const std::vector<ndarray_len_t> & shape)457 inline ndarray_len_t comp_size(const std::vector<ndarray_len_t>& shape) {
458 ndarray_len_t size = 1;
459 for (ndarray_len_t i : shape )
460 size *= i;
461
462 return size;
463 }
464
465 template<typename Scalar>
SaveArrayAsNumpy(const std::string & filename,bool fortran_order,unsigned int n_dims,const unsigned long shape[],const std::vector<Scalar> & data)466 inline void SaveArrayAsNumpy( const std::string& filename, bool fortran_order, unsigned int n_dims, const unsigned long shape[], const std::vector<Scalar>& data)
467 {
468 Typestring typestring_o(data);
469 std::string typestring = typestring_o.str();
470
471 std::ofstream stream( filename, std::ofstream::binary);
472 if(!stream) {
473 throw std::runtime_error("io error: failed to open a file.");
474 }
475
476 std::vector<ndarray_len_t> shape_v(shape, shape+n_dims);
477 write_header(stream, typestring, fortran_order, shape_v);
478
479 auto size = static_cast<size_t>(comp_size(shape_v));
480
481 stream.write(reinterpret_cast<const char*>(data.data()), sizeof(Scalar) * size);
482 }
483
484
485 template<typename Scalar>
LoadArrayFromNumpy(const std::string & filename,std::vector<unsigned long> & shape,std::vector<Scalar> & data)486 inline void LoadArrayFromNumpy(const std::string& filename, std::vector<unsigned long>& shape, std::vector<Scalar>& data)
487 {
488 std::ifstream stream(filename, std::ifstream::binary);
489 if(!stream) {
490 throw std::runtime_error("io error: failed to open a file.");
491 }
492
493 std::string header = read_header(stream);
494
495 // parse header
496 bool fortran_order;
497 std::string typestr;
498
499 parse_header(header, typestr, fortran_order, shape);
500
501 // check if the typestring matches the given one
502 Typestring typestring_o {data};
503 std::string expect_typestr = typestring_o.str();
504 if (typestr != expect_typestr) {
505 throw std::runtime_error("formatting error: typestrings not matching");
506 }
507
508
509 // compute the data size based on the shape
510 auto size = static_cast<size_t>(comp_size(shape));
511 data.resize(size);
512
513 // read the data
514 stream.read(reinterpret_cast<char*>(data.data()), sizeof(Scalar)*size);
515 }
516
517 } // namespace npy
518
519 #endif // NPY_H
520