1// Copyright 2018 The Wuffs Authors. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// +build ignore 16 17package main 18 19// print-lzw-example.go prints a worked example discussed in std/lzw/README.md, 20// based on three implementations of a simplified core of the LZW algorithm. 21// Simplifications include assuming LSB-first, 8 bit literals, no clear codes, 22// no full tables and that the input is given in terms of a code stream, not a 23// bit stream. 24// 25// The implementations are designed for ease of studying, and for e.g. a 26// minimal diff between the suf1 and sufQ implementations, and are not 27// optimized for performance. 28// 29// Usage: go run print-lzw-example.go 30// 31// You can also play with the code (e.g. modify Q and re-run) online: 32// https://play.golang.org/p/1aLC_XHZzoA 33 34import ( 35 "fmt" 36) 37 38const ( 39 expectedOutput = "TOBEORNOTTOBEORTOBEORNOTXOTXOTXOOTXOOOTXOOOTOBEY" 40 41 clearCode = 0x100 42 endCode = 0x101 43 44 // Neither clearCode or endCode can ever be a valid prefix code. We 45 // arbitrarily pick clearCode to represent "no prefix". 46 noPrefix = clearCode 47 48 // Q is the maximum possible suffix length, in bytes, which equals 1 plus 49 // the maximum possible "suffix length minus 1". Q == 1 means that sufQ is 50 // equivalent to suf1. 51 // 52 // In std/lzw/README.md, Q is 3 to keep the example short and simple. In 53 // practice, especially on modern CPUs that can do unaligned 32 or 64 bit 54 // loads and stores, Q is 4 or 8. 55 Q = 3 56) 57 58var codes = []int{ 59 'T', 'O', 'B', 'E', 'O', 'R', 'N', 'O', 'T', 60 0x102, 0x104, 0x106, 0x10B, 0x105, 0x107, 0x109, 61 'X', 62 0x111, 0x113, 0x114, 0x115, 0x10E, 63 'Y', 64 0x101, 65} 66 67func codeString(code int) string { 68 if code < clearCode { 69 return fmt.Sprintf("%5c", code) 70 } else if code == noPrefix { 71 return " -" 72 } 73 return fmt.Sprintf("0x%03X", code) 74} 75 76func main() { 77 n, s, q := new(naive), new(suf1), new(sufQ) 78 decode(n) 79 decode(s) 80 decode(q) 81 82 key := endCode 83 fmt.Printf(" Code Emits Key Value Pre1+Suf1 LM1 /Q %%Q PreQ+SufQ\n") 84 for _, code := range codes { 85 // Code 86 fmt.Printf("%s", codeString(code)) 87 88 // Emits 89 if code == endCode { 90 fmt.Printf(" -end-\n") 91 break 92 } 93 fmt.Printf(" %6s", n[code]) 94 95 if key != endCode { 96 fmt.Printf(" 0x%03X %7s %s %c %3d %3d %3d %s %s", 97 key, // Key 98 n[key], // Value 99 codeString(int(s.prefixes[key])), // Pre1 100 s.suffixes[key], // Suf1 101 q.lengthM1s[key], // LM1 102 q.lengthM1s[key]/Q, // /Q 103 q.lengthM1s[key]%Q, // %Q 104 codeString(int(q.prefixes[key])), // PreQ 105 string(q.suffixes[key][:]), // SufQ 106 ) 107 } 108 109 fmt.Println() 110 key++ 111 } 112} 113 114type implementation interface { 115 initialize() 116 emit(buffer []byte, code int, prevCode int, n int) []byte 117} 118 119func decode(impl implementation) { 120 impl.initialize() 121 122 output, prevCode, n := []byte(nil), -1, 0x101 123 for _, code := range codes { 124 if code == clearCode { 125 panic("unimplemented clearCode") 126 } else if code == endCode { 127 if string(output) != expectedOutput { 128 panic("unexpected output") 129 } 130 return 131 } else if code > n { 132 panic("invalid code") 133 } 134 135 // We have a literal or copy code. 136 output = impl.emit(output, code, prevCode, n) 137 138 prevCode, n = code, n+1 139 if n >= 4096 { 140 panic("unimplemented table-is-full") 141 } 142 } 143 panic("missing endCode") 144} 145 146// naive is a simple implementation that, in the worst case, requires O(N*N) 147// memory. 148type naive [4096]string 149 150func (t *naive) initialize() { 151 for i := 0; i < clearCode; i++ { 152 t[i] = string([]byte{byte(i)}) 153 } 154} 155 156func (t *naive) emit(output []byte, code int, prevCode int, n int) []byte { 157 if prevCode >= 0 { 158 prevOutput := t[prevCode] 159 if code < n { 160 t[n] = prevOutput + t[code][:1] 161 } else { 162 t[n] = prevOutput + prevOutput[:1] 163 } 164 } 165 return append(output, t[code]...) 166} 167 168// suf1 keeps separate prefix and suffix tables, 1 byte per suffix. 169type suf1 struct { 170 prefixes [4096]uint16 171 suffixes [4096]byte 172} 173 174func (t *suf1) initialize() { 175 for i := 0; i < clearCode; i++ { 176 t.prefixes[i] = noPrefix 177 t.suffixes[i] = byte(i) 178 } 179} 180 181func (t *suf1) emit(output []byte, code int, prevCode int, n int) []byte { 182 // Fill an intermediate buffer from back to front, 1 byte at a time. 183 buffer := [4096]byte{} 184 i := len(buffer) 185 186 c := code 187 if code == n { 188 c = prevCode 189 i-- 190 // buffer[4095] will be set later. 191 } 192 193 firstByte := byte(0) 194 for { 195 suffix := t.suffixes[c] 196 i-- 197 buffer[i] = suffix 198 c = int(t.prefixes[c]) 199 if c == noPrefix { 200 firstByte = suffix 201 break 202 } 203 } 204 205 if code == n { 206 buffer[4095] = firstByte 207 } 208 209 // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of 210 // 0, is correct conceptually, but unnecessary in practice. Look for "fake 211 // key-value pair" in std/lzw/README.md. 212 if prevCode >= 0 { 213 t.prefixes[n] = uint16(prevCode) 214 t.suffixes[n] = firstByte 215 } 216 217 return append(output, buffer[i:4096]...) 218} 219 220// sufQ keeps separate prefix and suffix tables, up to Q bytes per suffix. 221type sufQ struct { 222 lengthM1s [4096]uint16 223 prefixes [4096]uint16 224 suffixes [4096][Q]byte 225} 226 227func (t *sufQ) initialize() { 228 for i := 0; i < clearCode; i++ { 229 t.lengthM1s[i] = 0 230 t.prefixes[i] = noPrefix 231 t.suffixes[i][0] = byte(i) 232 for j := 1; j < Q; j++ { 233 // Unnecessary for correct output, but makes the printed table prettier. 234 t.suffixes[i][j] = '.' 235 } 236 } 237} 238 239func (t *sufQ) emit(output []byte, code int, prevCode int, n int) []byte { 240 // Fill an intermediate buffer from back to front, Q bytes at a time. 241 buffer := [4096 + Q - 1]byte{} 242 i := len(buffer) 243 244 c := code 245 if code == n { 246 c = prevCode 247 i-- 248 // buffer[4095] will be set later. 249 } 250 251 i -= int(t.lengthM1s[c] % Q) 252 253 firstByte := byte(0) 254 for { 255 suffix := t.suffixes[c] 256 i -= Q 257 copy(buffer[i:i+Q], suffix[:]) 258 c = int(t.prefixes[c]) 259 if c == noPrefix { 260 firstByte = suffix[0] 261 break 262 } 263 } 264 265 if code == n { 266 buffer[4095] = firstByte 267 } 268 269 // The "if prevCode >= 0" guard, and initializing prevCode to -1 instead of 270 // 0, is correct conceptually, but unnecessary in practice. Look for "fake 271 // key-value pair" in std/lzw/README.md. 272 if prevCode >= 0 { 273 lm1 := t.lengthM1s[prevCode] + 1 274 t.lengthM1s[n] = lm1 275 lm1 %= Q 276 if lm1 != 0 { 277 // Copy the prevCode's prefix and suffix, and then extend the suffix. 278 t.prefixes[n] = t.prefixes[prevCode] 279 t.suffixes[n] = t.suffixes[prevCode] 280 t.suffixes[n][lm1] = firstByte 281 } else { 282 // Start a new suffix. 283 t.prefixes[n] = uint16(prevCode) 284 t.suffixes[n][0] = firstByte 285 for j := 1; j < Q; j++ { 286 // Unnecessary for correct output, but makes the printed table prettier. 287 t.suffixes[n][j] = '.' 288 } 289 } 290 } 291 292 return append(output, buffer[i:4096]...) 293} 294