// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build ignore

package main

// crawl.go crawls a list of HTTP and HTTPS URLs. If the URL yields an HTML
// file, that file is parsed and the "<img src=etc>" tags within it are
// followed (but not recursively).
//
// The crawler writes files to disk with filenames based on the hash of the
// content (thus de-duplicating e.g. a site's 404 Not Found page even if served
// from multiple URLs). It also writes a manifest.tsv file that records the
// mapping from the original URL to that content hash.
//
// Usage: go run crawl.go -outdir foo -infile urls.txt 2>log.txt

import (
	"bufio"
	"bytes"
	"crypto/sha256"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"golang.org/x/net/html"
)

var (
	datadirlevelsFlag = flag.Int("datadirlevels", 1,
		"number of directories in the ab/cd/efgh output data filenames; valid range is [0..8]")
	flushfreqFlag   = flag.Int("flushfreq", 256, "write out in-progress manifest.tsv on every flushfreq entries")
	httptimeoutFlag = flag.Duration("httptimeout", 30*time.Second, "HTTP Client timeout; zero means unlimited")
	infileFlag      = flag.String("infile", "", "source file containing URLs to crawl")
	inlimitFlag     = flag.Int("inlimit", -1, "if non-negative, the maximum number of input URLs")
	nworkersFlag    = flag.Int("nworkers", 16, "number of concurrent crawler workers")
	outdirFlag      = flag.String("outdir", "", "directory to place crawled data")
	sleepFlag       = flag.Duration("sleep", 1*time.Second,
		"minimum duration to sleep between HTTP requests to the same domain")
)

func main() {
	if err := main1(); err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}

func main1() error {
	flag.Parse()
	if *datadirlevelsFlag < 0 || 8 < *datadirlevelsFlag {
		return fmt.Errorf("-datadirlevels out of bounds [0..8]")
	}
	if *infileFlag == "" {
		return fmt.Errorf("-infile not given")
	}
	if *outdirFlag == "" {
		return fmt.Errorf("-outdir not given")
	}

	urlsGroupedByDomain, err := parseInfile()
	if err != nil {
		return err
	}

	global.manifest, err = parseManifest()
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	urlsChan := make(chan []*url.URL)
	for i := 0; i < *nworkersFlag; i++ {
		wg.Add(1)
		w := worker(i)
		go w.work(&wg, urlsChan)
	}
	for _, u := range urlsGroupedByDomain {
		urlsChan <- u
	}
	close(urlsChan)
	log.Printf("master: no more work (inlimit is %d)", *inlimitFlag)
	wg.Wait()

	return flush(global.manifest.m)
}

var global struct {
	manifest manifest
}

const keySize = sha256.Size // 32 bytes.
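
// Both URLs and fetched content are identified by their SHA-256 digests: a
// URL's hash keys the manifest, while the content's hash names the file
// written under -outdir/data, de-duplicating identical bodies served from
// different URLs.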

type key [keySize]byte

func (k key) Cmp(l key) int {
	for a := 0; a < keySize; a++ {
		if k[a] < l[a] {
			return -1
		} else if k[a] > l[a] {
			return +1
		}
	}
	return 0
}

func (k key) Str(levels int) string {
	const hex = "0123456789abcdef"
	var b [128]byte
	n := 0
	for i, x := range k {
		b[n+0] = hex[x>>4]
		b[n+1] = hex[x&0xF]
		n += 2
		if i < levels {
			b[n] = '/'
			n++
		}
	}
	return string(b[:n])
}

func hash(b []byte) key {
	return sha256.Sum256(b)
}

func unhex(b byte) uint8 {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b + 10 - 'a'
	}
	return 0xFF
}

func parseHash(b []byte) (k key, ok bool) {
	i := 0
	for i < keySize && len(b) >= 2 {
		if b[0] == '/' {
			b = b[1:]
			continue
		}
		u0 := unhex(b[0])
		u1 := unhex(b[1])
		if u0 > 15 || u1 > 15 {
			return key{}, false
		}
		k[i] = u0<<4 | u1
		i++
		b = b[2:]
	}
	return k, i == keySize && len(b) == 0
}

type entry struct {
	contentHash     key
	urlHash         key
	httpStatusCode  uint32
	sniffedMimeType string
	size            uint64
	url             *url.URL
}

type manifest struct {
	lock       sync.Mutex
	m          map[key]entry // maps from urlHash to entry.
	numPending int
	flushing   bool
}

func (m *manifest) get(k key) entry {
	m.lock.Lock()
	e := m.m[k]
	m.lock.Unlock()
	return e
}

func (m *manifest) put(k key, e entry) {
	clone := map[key]entry(nil)

	m.lock.Lock()
	if m.m == nil {
		m.m = map[key]entry{}
	}
	m.m[k] = e
	m.numPending++
	if m.numPending >= *flushfreqFlag && !m.flushing {
		m.numPending = 0
		m.flushing = true
		clone = make(map[key]entry, len(m.m))
		for mk, mv := range m.m {
			clone[mk] = mv
		}
	}
	m.lock.Unlock()

	if clone != nil {
		if err := flush(clone); err != nil {
			log.Println(err)
		}

		m.lock.Lock()
		m.flushing = false
		m.lock.Unlock()
	}
}

func flush(m map[key]entry) error {
	log.Printf("Write manifest.tsv (%d entries) started...", len(m))
	defer log.Printf("Write manifest.tsv (%d entries) finished.", len(m))

	keys := make([][2]key, 0, len(m))
	for _, v := range m {
		keys = append(keys, [2]key{v.contentHash, v.urlHash})
	}

	sort.Slice(keys, func(i, j int) bool {
		if cmp := keys[i][0].Cmp(keys[j][0]); cmp != 0 {
			return cmp < 0
		}
		if cmp := keys[i][1].Cmp(keys[j][1]); cmp != 0 {
			return cmp < 0
		}
		return false
	})

	filename0 := filepath.Join(*outdirFlag, "manifest.tsv.inprogress")
	filename1 := filepath.Join(*outdirFlag, "manifest.tsv")

	f, err := os.Create(filename0)
	if err != nil {
		return err
	}

	w := bufio.NewWriter(f)
	fmt.Fprintf(w, "#ContentHash\tURLHash\tHTTPStatusCode\tSniffedMIMEType\tSize\tURL\n")
	for _, k := range keys {
		e := m[k[1]]
		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%d\t%v\n",
			e.contentHash.Str(*datadirlevelsFlag), e.urlHash.Str(0),
			e.httpStatusCode, e.sniffedMimeType, e.size, e.url)
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}

	return os.Rename(filename0, filename1)
}

func parseInfile() (map[string][]*url.URL, error) {
	f, err := os.Open(*infileFlag)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string][]*url.URL{}

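	// Each non-blank, non-comment line contributes at most one URL (subject
	// to the optional -inlimit cap). URLs are grouped by host so that a
	// single worker handles all of a domain's fetches, keeping the per-domain
	// -sleep politeness delay meaningful.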
	n := 0
	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		// Strip everything up to the first whitespace.
		for i, x := range b {
			if x <= ' ' {
				b = b[:i]
				break
			}
		}

		u, err := url.Parse(string(b))
		if err != nil {
			continue
		}

		if *inlimitFlag >= 0 && n == *inlimitFlag {
			break
		}
		n++

		m[u.Host] = append(m[u.Host], u)
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return m, nil
}

func parseManifest() (manifest, error) {
	f, err := os.Open(filepath.Join(*outdirFlag, "manifest.tsv"))
	if err != nil {
		if os.IsNotExist(err) {
			err = nil
		}
		return manifest{}, err
	}
	defer f.Close()

	m := map[key]entry{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (space, tab or other control character).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		e := parseEntry(b)
		if e.url == nil {
			continue
		}

		m[e.urlHash] = e
	}
	if err := s.Err(); err != nil {
		return manifest{}, err
	}

	return manifest{m: m}, nil
}

func parseEntry(b []byte) (e entry) {
	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.contentHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.urlHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 32); err != nil {
		return entry{}
	} else {
		e.httpStatusCode = uint32(u)
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else {
		e.sniffedMimeType = string(b[:i])
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 64); err != nil {
		return entry{}
	} else {
		e.size = uint64(u)
		b = b[i+1:]
	}

	if u, err := url.Parse(string(b)); err != nil {
		return entry{}
	} else {
		e.url = u
	}

	return e
}

func sleep(rng *rand.Rand) {
	jitter := rng.Int63n(int64(*sleepFlag))
	time.Sleep(*sleepFlag + time.Duration(jitter))
}

type worker int

func (w worker) work(wg *sync.WaitGroup, urlsChan chan []*url.URL) {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	defer wg.Done()
	for urls := range urlsChan {
		for _, u := range urls {
			e, links, fetched := w.work1(u, true)
			if fetched {
				sleep(rng)
			}

			for _, u := range links {
				if ee, _, fetched := w.work1(u, false); fetched {
					sleep(rng)
					if ee.url != nil {
						global.manifest.put(ee.urlHash, ee)
					}
				}
			}

			if fetched && e.url != nil {
				global.manifest.put(e.urlHash, e)
			}
		}
	}
	log.Printf("worker #%03d: no more work", w)
}

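// work1 fetches a single URL, unless the manifest already has an entry for
// it. Newly seen content is written to disk under its content hash, and if
// followHTML is true and the body looks like HTML, the <img src=...> links
// found in it are returned for the caller to follow.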
func (w worker) work1(u *url.URL, followHTML bool) (e entry, links []*url.URL, fetched bool) {
	if u.Scheme != "http" && u.Scheme != "https" {
		return entry{}, nil, false
	}

	urlString := u.String()
	urlHash := hash([]byte(urlString))
	e = global.manifest.get(urlHash)
	if e.url != nil {
		log.Printf("worker #%03d: skipping %q", w, urlString)
		return e, nil, false
	}
	log.Printf("worker #%03d: fetching %q", w, urlString)

	statusCode, data, err := fetch(urlString)
	if err != nil {
		log.Printf("worker #%03d: %v", w, err)
		return entry{}, nil, true
	}
	e = entry{
		contentHash:     hash(data),
		urlHash:         urlHash,
		httpStatusCode:  statusCode,
		sniffedMimeType: sniff(data),
		size:            uint64(len(data)),
		url:             u,
	}

	filename := filepath.Join(*outdirFlag, "data", filepath.FromSlash(e.contentHash.Str(*datadirlevelsFlag)))
	if _, err := os.Stat(filename); os.IsNotExist(err) {
		log.Printf("worker #%03d: writing %q", w, urlString)
		os.MkdirAll(filepath.Dir(filename), 0755)
		if err := ioutil.WriteFile(filename, data, 0644); err != nil {
			log.Println(err)
			return entry{}, nil, true
		}
	}

	if followHTML && looksLikeHTML(data) {
		log.Printf("worker #%03d: parsing %q", w, urlString)
		links = parseHTML(u, data)
	}
	return e, links, true
}

func fetch(urlString string) (statusCode uint32, body []byte, retErr error) {
	client := &http.Client{
		Timeout: *httptimeoutFlag,
	}

	res, err := client.Get(urlString)
	if err != nil {
		return 0, nil, err
	}
	defer res.Body.Close()

	body, err = ioutil.ReadAll(res.Body)
	if err != nil {
		return 0, nil, err
	}
	return uint32(res.StatusCode), body, nil
}

func parseHTML(u *url.URL, data []byte) (links []*url.URL) {
	z := html.NewTokenizer(bytes.NewReader(data))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		tn, hasAttr := z.TagName()
		if len(tn) != 3 || string(tn) != "img" {
			continue
		}
		for hasAttr {
			key, val, moreAttr := z.TagAttr()
			if len(key) == 3 && string(key) == "src" {
				if v, err := url.Parse(string(val)); err == nil {
					links = append(links, u.ResolveReference(v))
				}
			}
			hasAttr = moreAttr
		}
	}
	return links
}

var (
	prefixBM     = []byte("BM")
	prefixGIF    = []byte("GIF8")
	prefixJPEG   = []byte("\xFF\xD8")
	prefixPNG    = []byte("\x89PNG\r\n\x1A\n")
	prefixRIFF   = []byte("RIFF")
	prefixTIFFBE = []byte("MM\x00\x2A")
	prefixTIFFLE = []byte("II\x2A\x00")
	prefixWEBP   = []byte("WEBP")
	prefixZZZZ   = []byte("\x00\x00\x00\x00")
)

func sniff(b []byte) string {
	switch {
	case bytes.HasPrefix(b, prefixBM):
		if len(b) > 6 && bytes.HasPrefix(b[6:], prefixZZZZ) {
			return "image/bmp"
		}
	case bytes.HasPrefix(b, prefixGIF):
		return "image/gif"
	case bytes.HasPrefix(b, prefixJPEG):
		return "image/jpeg"
	case bytes.HasPrefix(b, prefixPNG):
		return "image/png"
	case bytes.HasPrefix(b, prefixRIFF):
		if len(b) > 8 && bytes.HasPrefix(b[8:], prefixWEBP) {
			return "image/webp"
		}
	case bytes.HasPrefix(b, prefixTIFFBE), bytes.HasPrefix(b, prefixTIFFLE):
		return "image/tiff"
	}

	if looksLikeHTML(b) {
		return "text/html"
	}
	return "unknown"
}

func looksLikeHTML(b []byte) bool {
	for len(b) > 0 {
		if b[0] > ' ' {
			return b[0] == '<'
		}
		b = b[1:]
	}
	return false
}