1 // Copyright 2020 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 #pragma once
15 
16 #include <stddef.h>
17 #include <stdint.h>
18 
19 #include "pw_polyfill/standard.h"
20 #include "pw_preprocessor/util.h"
21 
22 // The hash implementation uses std::string_view. Use the C implementation when
23 // compiling with older C++ standards.
24 #if PW_CXX_STANDARD_IS_SUPPORTED(17)
25 
26 #include <string_view>
27 
28 #include "pw_preprocessor/compiler.h"
29 #include "pw_tokenizer/config.h"
30 
31 namespace pw::tokenizer {
32 
33 // The constant to use when generating the hash. Changing this changes the value
34 // of all hashes, so do not change it randomly.
35 inline constexpr uint32_t k65599HashConstant = 65599u;
36 
37 // Calculates the hash of a string. This function calculates hashes at either
38 // runtime or compile time in C++ code.
39 //
40 // Unlike the C hashing macro, this hash supports strings of any length. Strings
41 // longer than the maximum C hashing macro's length will hash different values
42 // in C and C++. If the same very long string is used in C and C++, the string
43 // will appear with both tokens in the resulting database.
44 //
45 // This hash is calculated with the following equation, where s is the string
46 // and k is the maximum hash length:
47 //
48 //    H(s, k) = len(s) + 65599 * s[0] + 65599^2 * s[1] + ... + 65599^k * s[k-1]
49 //
50 // The hash algorithm is a modified version of the x65599 hash used by the SDBM
51 // open source project. The modifications were made to support hashing in C
52 // macros. These are the differences from x65599:
53 //
54 //   - Characters are hashed in reverse order.
55 //   - The string length is hashed as the first character in the string.
56 //
Hash(std::string_view string)57 constexpr uint32_t Hash(std::string_view string)
58     PW_NO_SANITIZE("unsigned-integer-overflow") {
59   // The length is hashed as if it were the first character.
60   uint32_t hash = string.size();
61   uint32_t coefficient = k65599HashConstant;
62 
63   // Hash all of the characters in the string as unsigned ints.
64   // The coefficient calculation is done modulo 0x100000000, so the unsigned
65   // integer overflows are intentional.
66   for (uint8_t ch : string) {
67     hash += coefficient * ch;
68     coefficient *= k65599HashConstant;
69   }
70 
71   return hash;
72 }
73 
74 // Take the string as an array to support either literals or character arrays,
75 // but not const char*.
76 template <size_t kSize>
Hash(const char (& string)[kSize])77 constexpr uint32_t Hash(const char (&string)[kSize]) {
78   static_assert(kSize > 0);
79   return Hash(std::string_view(string, kSize - 1));
80 }
81 
82 // This hash function is equivalent to the C hashing macros. It hashses a string
83 // up to a maximum length.
84 constexpr uint32_t PwTokenizer65599FixedLengthHash(
85     std::string_view string,
86     size_t hash_length = PW_TOKENIZER_CFG_C_HASH_LENGTH)
87     PW_NO_SANITIZE("unsigned-integer-overflow") {
88   uint32_t hash = string.size();
89   uint32_t coefficient = k65599HashConstant;
90 
91   for (uint8_t ch : string.substr(0, hash_length)) {
92     hash += coefficient * ch;
93     coefficient *= k65599HashConstant;
94   }
95 
96   return hash;
97 }
98 
99 // Character array version of PwTokenizer65599FixedLengthHash.
100 template <size_t kSize>
101 constexpr uint32_t PwTokenizer65599FixedLengthHash(
102     const char (&string)[kSize],
103     size_t hash_length = PW_TOKENIZER_CFG_C_HASH_LENGTH) {
104   static_assert(kSize > 0);
105   return PwTokenizer65599FixedLengthHash(std::string_view(string, kSize - 1),
106                                          hash_length);
107 }
108 
109 }  // namespace pw::tokenizer
110 
111 #endif  // PW_CXX_STANDARD_IS_SUPPORTED(17)
112 
113 // C version of the fixed-length hash. Can be used to calculate hashes
114 // equivalent to the hashing macros at runtime in C.
115 PW_EXTERN_C uint32_t pw_tokenizer_65599FixedLengthHash(const char* string,
116                                                        size_t string_length,
117                                                        size_t hash_length);
118