// Copyright 2018 The Wuffs Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build ignore

package main

// crawl.go crawls a list of HTTP and HTTPS URLs. If a URL yields an HTML
// file, that file is parsed and the "<img src=etc>" tags within it are
// followed (but not recursively).
//
// The crawler writes files to disk with filenames based on the hash of the
// content (thus de-duplicating e.g. a site's 404 Not Found page even if served
// from multiple URLs). It also writes a manifest.tsv file that records the
// mapping from the original URL to that content hash.
//
// Usage: go run crawl.go -outdir foo -infile urls.txt 2>log.txt
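//
// The input file contains one URL per line. Blank lines and lines starting
// with "#" are skipped, and anything after a URL's first whitespace is
// ignored (see parseInfile below). For example (illustrative URLs only):
//
//	# Seed list.
//	https://example.com/index.html
//	https://example.com/gallery.html  trailing notes are ignored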

import (
	"bufio"
	"bytes"
	"crypto/sha256"
	"flag"
	"fmt"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"sync"
	"time"

	"golang.org/x/net/html"
)

var (
	datadirlevelsFlag = flag.Int("datadirlevels", 1,
		"number of directory levels in the ab/cd/efgh output data filenames; valid range is [0..8]")
	flushfreqFlag   = flag.Int("flushfreq", 256, "write out the in-progress manifest.tsv after every flushfreq new entries")
	httptimeoutFlag = flag.Duration("httptimeout", 30*time.Second, "HTTP Client timeout; zero means unlimited")
	infileFlag      = flag.String("infile", "", "source file containing URLs to crawl")
	inlimitFlag     = flag.Int("inlimit", -1, "if non-negative, the maximum number of input URLs")
	nworkersFlag    = flag.Int("nworkers", 16, "number of concurrent crawler workers")
	outdirFlag      = flag.String("outdir", "", "directory to place crawled data")
	sleepFlag       = flag.Duration("sleep", 1*time.Second,
		"minimum duration to sleep between HTTP requests to the same domain")
)

func main() {
	if err := main1(); err != nil {
		os.Stderr.WriteString(err.Error() + "\n")
		os.Exit(1)
	}
}

func main1() error {
	flag.Parse()
	if *datadirlevelsFlag < 0 || 8 < *datadirlevelsFlag {
		return fmt.Errorf("-datadirlevels out of bounds [0..8]")
	}
	if *infileFlag == "" {
		return fmt.Errorf("-infile not given")
	}
	if *outdirFlag == "" {
		return fmt.Errorf("-outdir not given")
	}

	urlsGroupedByDomain, err := parseInfile()
	if err != nil {
		return err
	}

	global.manifest, err = parseManifest()
	if err != nil {
		return err
	}

	wg := sync.WaitGroup{}
	urlsChan := make(chan []*url.URL)
	for i := 0; i < *nworkersFlag; i++ {
		wg.Add(1)
		w := worker(i)
		go w.work(&wg, urlsChan)
	}
	for _, u := range urlsGroupedByDomain {
		urlsChan <- u
	}
	close(urlsChan)
	log.Printf("master: no more work (inlimit is %d)", *inlimitFlag)
	wg.Wait()

	return flush(global.manifest.m)
}

var global struct {
	manifest manifest
}

const keySize = sha256.Size // 32 bytes.

type key [keySize]byte

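// Cmp returns -1, 0 or +1 as k is byte-wise less than, equal to or greater
// than l.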
func (k key) Cmp(l key) int {
	for a := 0; a < keySize; a++ {
		if k[a] < l[a] {
			return -1
		} else if k[a] > l[a] {
			return +1
		}
	}
	return 0
}

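// Str returns k in lowercase hexadecimal, with a '/' inserted after each of
// the first levels bytes, yielding the ab/cd/efgh output filename layout.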
func (k key) Str(levels int) string {
	const hex = "0123456789abcdef"
	var b [128]byte
	n := 0
	for i, x := range k {
		b[n+0] = hex[x>>4]
		b[n+1] = hex[x&0xF]
		n += 2
		if i < levels {
			b[n] = '/'
			n++
		}
	}
	return string(b[:n])
}

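// hash returns the SHA-256 digest of b.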
func hash(b []byte) key {
	return sha256.Sum256(b)
}

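// unhex returns the value of a hexadecimal digit ('0'-'9' or 'a'-'f'), or
// 0xFF for any other byte.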
func unhex(b byte) uint8 {
	if '0' <= b && b <= '9' {
		return b - '0'
	}
	if 'a' <= b && b <= 'f' {
		return b + 10 - 'a'
	}
	return 0xFF
}

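// parseHash decodes a key from its hexadecimal form, ignoring any '/'
// separators. It is the inverse of key.Str. ok is false unless b contains
// exactly keySize bytes' worth of hex digits.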
func parseHash(b []byte) (k key, ok bool) {
	i := 0
	for i < keySize && len(b) >= 2 {
		if b[0] == '/' {
			b = b[1:]
			continue
		}
		u0 := unhex(b[0])
		u1 := unhex(b[1])
		if u0 > 15 || u1 > 15 {
			return key{}, false
		}
		k[i] = u0<<4 | u1
		i++
		b = b[2:]
	}
	return k, i == keySize && len(b) == 0
}

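// entry is one manifest row: the crawl result for a single URL.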
type entry struct {
	contentHash     key
	urlHash         key
	httpStatusCode  uint32
	sniffedMimeType string
	size            uint64
	url             *url.URL
}

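// manifest is the in-memory form of the manifest.tsv file. lock guards m,
// numPending and flushing.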
type manifest struct {
	lock       sync.Mutex
	m          map[key]entry // maps from urlHash to entry.
	numPending int
	flushing   bool
}

func (m *manifest) get(k key) entry {
	m.lock.Lock()
	e := m.m[k]
	m.lock.Unlock()
	return e
}

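// put inserts an entry and, once -flushfreq new entries have accumulated,
// snapshots the map and writes it out to disk. The flushing field ensures
// that at most one flush is in progress at a time.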
func (m *manifest) put(k key, e entry) {
	clone := map[key]entry(nil)

	m.lock.Lock()
	if m.m == nil {
		m.m = map[key]entry{}
	}
	m.m[k] = e
	m.numPending++
	if m.numPending >= *flushfreqFlag && !m.flushing {
		m.numPending = 0
		m.flushing = true
		clone = make(map[key]entry, len(m.m))
		for mk, mv := range m.m {
			clone[mk] = mv
		}
	}
	m.lock.Unlock()

	if clone != nil {
		if err := flush(clone); err != nil {
			log.Println(err)
		}

		m.lock.Lock()
		m.flushing = false
		m.lock.Unlock()
	}
}

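// flush writes the manifest entries (sorted by content hash, then URL hash)
// to a temporary file, then atomically renames it to manifest.tsv.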
func flush(m map[key]entry) error {
	log.Printf("Write manifest.tsv (%d entries) started...", len(m))
	defer log.Printf("Write manifest.tsv (%d entries) finished.", len(m))

	keys := make([][2]key, 0, len(m))
	for _, v := range m {
		keys = append(keys, [2]key{v.contentHash, v.urlHash})
	}

	sort.Slice(keys, func(i, j int) bool {
		if cmp := keys[i][0].Cmp(keys[j][0]); cmp != 0 {
			return cmp < 0
		}
		if cmp := keys[i][1].Cmp(keys[j][1]); cmp != 0 {
			return cmp < 0
		}
		return false
	})

	filename0 := filepath.Join(*outdirFlag, "manifest.tsv.inprogress")
	filename1 := filepath.Join(*outdirFlag, "manifest.tsv")

	f, err := os.Create(filename0)
	if err != nil {
		return err
	}

	w := bufio.NewWriter(f)
	fmt.Fprintf(w, "#ContentHash\tURLHash\tHTTPStatusCode\tSniffedMIMEType\tSize\tURL\n")
	for _, k := range keys {
		e := m[k[1]]
		fmt.Fprintf(w, "%s\t%s\t%d\t%s\t%d\t%v\n",
			e.contentHash.Str(*datadirlevelsFlag), e.urlHash.Str(0),
			e.httpStatusCode, e.sniffedMimeType, e.size, e.url)
	}
	if err := w.Flush(); err != nil {
		f.Close()
		return err
	}
	if err := f.Close(); err != nil {
		return err
	}

	return os.Rename(filename0, filename1)
}

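// parseInfile reads the -infile URL list, returning the URLs grouped by
// their host so that a worker fetches same-host URLs sequentially.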
func parseInfile() (map[string][]*url.URL, error) {
	f, err := os.Open(*infileFlag)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string][]*url.URL{}

	n := 0
	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (any byte <= ' ': space, tab, CR, etc.).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		// Truncate at the first whitespace, ignoring the rest of the line.
		for i, x := range b {
			if x <= ' ' {
				b = b[:i]
				break
			}
		}

		u, err := url.Parse(string(b))
		if err != nil {
			continue
		}

		if *inlimitFlag >= 0 && n == *inlimitFlag {
			break
		}
		n++

		m[u.Host] = append(m[u.Host], u)
	}
	if err := s.Err(); err != nil {
		return nil, err
	}

	return m, nil
}

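// parseManifest loads the manifest.tsv from any previous run, so that
// already-crawled URLs are skipped. A missing file is not an error.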
func parseManifest() (manifest, error) {
	f, err := os.Open(filepath.Join(*outdirFlag, "manifest.tsv"))
	if err != nil {
		if os.IsNotExist(err) {
			err = nil
		}
		return manifest{}, err
	}
	defer f.Close()

	m := map[key]entry{}

	s := bufio.NewScanner(f)
	for s.Scan() {
		b := s.Bytes()

		// Strip leading whitespace (any byte <= ' ': space, tab, CR, etc.).
		for len(b) > 0 && b[0] <= ' ' {
			b = b[1:]
		}

		// Skip empty lines or comments starting with "#".
		if len(b) == 0 || b[0] == '#' {
			continue
		}

		e := parseEntry(b)
		if e.url == nil {
			continue
		}

		m[e.urlHash] = e
	}
	if err := s.Err(); err != nil {
		return manifest{}, err
	}

	return manifest{m: m}, nil
}

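// parseEntry parses one tab-separated manifest line. A malformed line yields
// the zero entry, whose url field is nil.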
func parseEntry(b []byte) (e entry) {
	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.contentHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if h, ok := parseHash(b[:i]); !ok {
		return entry{}
	} else {
		e.urlHash = h
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 32); err != nil {
		return entry{}
	} else {
		e.httpStatusCode = uint32(u)
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else {
		e.sniffedMimeType = string(b[:i])
		b = b[i+1:]
	}

	if i := bytes.IndexByte(b, '\t'); i < 0 {
		return entry{}
	} else if u, err := strconv.ParseUint(string(b[:i]), 10, 64); err != nil {
		return entry{}
	} else {
		e.size = u
		b = b[i+1:]
	}

	if u, err := url.Parse(string(b)); err != nil {
		return entry{}
	} else {
		e.url = u
	}

	return e
}

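// sleep pauses for the -sleep duration plus random jitter of up to that
// duration again.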
func sleep(rng *rand.Rand) {
	// Guard against rand.Int63n panicking when -sleep is zero or negative.
	if *sleepFlag <= 0 {
		return
	}
	jitter := rng.Int63n(int64(*sleepFlag))
	time.Sleep(*sleepFlag + time.Duration(jitter))
}

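// worker is the ID of one of the -nworkers crawling goroutines.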
type worker int

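// work fetches each group of same-host URLs received on urlsChan, also
// fetching (but not recursing into) any <img> links found in HTML content.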
func (w worker) work(wg *sync.WaitGroup, urlsChan chan []*url.URL) {
	rng := rand.New(rand.NewSource(time.Now().UnixNano()))
	defer wg.Done()
	for urls := range urlsChan {
		for _, u := range urls {
			e, links, fetched := w.work1(u, true)
			if fetched {
				sleep(rng)
			}

			for _, u := range links {
				if ee, _, fetched := w.work1(u, false); fetched {
					sleep(rng)
					if ee.url != nil {
						global.manifest.put(ee.urlHash, ee)
					}
				}
			}

			if fetched && e.url != nil {
				global.manifest.put(e.urlHash, e)
			}
		}
	}
	log.Printf("worker #%03d: no more work", w)
}

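// work1 handles a single URL. Previously crawled URLs are skipped. Otherwise,
// the content is fetched, saved under its content hash (unless that data file
// already exists) and, if followHTML is set, scanned for <img> links. The
// fetched result reports whether an HTTP request was made, successful or not.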
func (w worker) work1(u *url.URL, followHTML bool) (e entry, links []*url.URL, fetched bool) {
	if u.Scheme != "http" && u.Scheme != "https" {
		return entry{}, nil, false
	}

	urlString := u.String()
	urlHash := hash([]byte(urlString))
	e = global.manifest.get(urlHash)
	if e.url != nil {
		log.Printf("worker #%03d: skipping %q", w, urlString)
		return e, nil, false
	}
	log.Printf("worker #%03d: fetching %q", w, urlString)

	statusCode, data, err := fetch(urlString)
	if err != nil {
		log.Printf("worker #%03d: %v", w, err)
		return entry{}, nil, true
	}
	e = entry{
		contentHash:     hash(data),
		urlHash:         urlHash,
		httpStatusCode:  statusCode,
		sniffedMimeType: sniff(data),
		size:            uint64(len(data)),
		url:             u,
	}

	filename := filepath.Join(*outdirFlag, "data", filepath.FromSlash(e.contentHash.Str(*datadirlevelsFlag)))
	if _, err := os.Stat(filename); os.IsNotExist(err) {
		log.Printf("worker #%03d: writing %q", w, urlString)
		os.MkdirAll(filepath.Dir(filename), 0755)
		if err := ioutil.WriteFile(filename, data, 0644); err != nil {
			log.Println(err)
			return entry{}, nil, true
		}
	}

	if followHTML && looksLikeHTML(data) {
		log.Printf("worker #%03d: parsing %q", w, urlString)
		links = parseHTML(u, data)
	}
	return e, links, true
}

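// fetch performs an HTTP GET and reads the entire response body.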
func fetch(urlString string) (statusCode uint32, body []byte, retErr error) {
	client := &http.Client{
		Timeout: *httptimeoutFlag,
	}

	res, err := client.Get(urlString)
	if err != nil {
		return 0, nil, err
	}
	defer res.Body.Close()

	body, err = ioutil.ReadAll(res.Body)
	if err != nil {
		return 0, nil, err
	}
	return uint32(res.StatusCode), body, nil
}

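// parseHTML returns the "src" attributes of an HTML document's <img> tags,
// resolved relative to the document's own URL u.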
func parseHTML(u *url.URL, data []byte) (links []*url.URL) {
	z := html.NewTokenizer(bytes.NewReader(data))
	for {
		tt := z.Next()
		if tt == html.ErrorToken {
			break
		}
		if tt != html.StartTagToken && tt != html.SelfClosingTagToken {
			continue
		}
		tn, hasAttr := z.TagName()
		if len(tn) != 3 || string(tn) != "img" {
			continue
		}
		for hasAttr {
			key, val, moreAttr := z.TagAttr()
			if len(key) == 3 && string(key) == "src" {
				if v, err := url.Parse(string(val)); err == nil {
					links = append(links, u.ResolveReference(v))
				}
			}
			hasAttr = moreAttr
		}
	}
	return links
}

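// Magic numbers for sniffing common image formats.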
var (
	prefixBM     = []byte("BM")
	prefixGIF    = []byte("GIF8")
	prefixJPEG   = []byte("\xFF\xD8")
	prefixPNG    = []byte("\x89PNG\r\n\x1A\n")
	prefixRIFF   = []byte("RIFF")
	prefixTIFFBE = []byte("MM\x00\x2A")
	prefixTIFFLE = []byte("II\x2A\x00")
	prefixWEBP   = []byte("WEBP")
	prefixZZZZ   = []byte("\x00\x00\x00\x00")
)

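// sniff guesses the MIME type of b by examining its leading magic bytes,
// falling back to "text/html" or "unknown".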
func sniff(b []byte) string {
	switch {
	case bytes.HasPrefix(b, prefixBM):
		if len(b) > 6 && bytes.HasPrefix(b[6:], prefixZZZZ) {
			return "image/bmp"
		}
	case bytes.HasPrefix(b, prefixGIF):
		return "image/gif"
	case bytes.HasPrefix(b, prefixJPEG):
		return "image/jpeg"
	case bytes.HasPrefix(b, prefixPNG):
		return "image/png"
	case bytes.HasPrefix(b, prefixRIFF):
		if len(b) > 8 && bytes.HasPrefix(b[8:], prefixWEBP) {
			return "image/webp"
		}
	case bytes.HasPrefix(b, prefixTIFFBE), bytes.HasPrefix(b, prefixTIFFLE):
		return "image/tiff"
	}

	if looksLikeHTML(b) {
		return "text/html"
	}
	return "unknown"
}

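// looksLikeHTML reports whether b's first non-whitespace byte is '<'.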
func looksLikeHTML(b []byte) bool {
	for len(b) > 0 {
		if b[0] > ' ' {
			return b[0] == '<'
		}
		b = b[1:]
	}
	return false
}