1// Copyright 2017 Google Inc. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package finder 16 17import ( 18 "bufio" 19 "bytes" 20 "encoding/json" 21 "errors" 22 "fmt" 23 "io" 24 "os" 25 "path/filepath" 26 "runtime" 27 "sort" 28 "strings" 29 "sync" 30 "sync/atomic" 31 "time" 32 33 "android/soong/finder/fs" 34) 35 36// This file provides a Finder struct that can quickly search for files satisfying 37// certain criteria. 38// This Finder gets its speed partially from parallelism and partially from caching. 39// If a Stat call returns the same result as last time, then it means Finder 40// can skip the ReadDir call for that dir. 41 42// The primary data structure used by the finder is the field Finder.nodes , 43// which is a tree of nodes of type *pathMap . 44// Each node represents a directory on disk, along with its stats, subdirectories, 45// and contained files. 46 47// The common use case for the Finder is that the caller creates a Finder and gives 48// it the same query that was given to it in the previous execution. 49// In this situation, the major events that take place are: 50// 1. The Finder begins to load its db 51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads) 52// Calling Stat on each of these directories is generally a large fraction of the total time 53// 3. The Finder begins to construct a separate tree of nodes in each of its threads 54// 4. The Finder merges the individual node trees into the main node tree 55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date 56// These ReadDir calls might prompt additional Stat calls, etc 57// 6. The Finder waits for all loading to complete 58// 7. The Finder searches the cache for files matching the user's query (using multiple threads) 59 60// These are the invariants regarding concurrency: 61// 1. The public methods of Finder are threadsafe. 62// The public methods are only performance-optimized for one caller at a time, however. 63// For the moment, multiple concurrent callers shouldn't expect any better performance than 64// multiple serial callers. 65// 2. While building the node tree, only one thread may ever access the <children> collection of a 66// *pathMap at once. 67// a) The thread that accesses the <children> collection is the thread that discovers the 68// children (by reading them from the cache or by having received a response to ReadDir). 69// 1) Consequently, the thread that discovers the children also spawns requests to stat 70// subdirs. 71// b) Consequently, while building the node tree, no thread may do a lookup of its 72// *pathMap via filepath because another thread may be adding children to the 73// <children> collection of an ancestor node. Additionally, in rare cases, another thread 74// may be removing children from an ancestor node if the children were only discovered to 75// be irrelevant after calling ReadDir (which happens if a prune-file was just added). 76// 3. No query will begin to be serviced until all loading (both reading the db 77// and scanning the filesystem) is complete. 78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to 79// generate it, making this not a huge loss in performance. 80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before 81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap) 82 83// see cmd/finder.go or finder_test.go for usage examples 84 85// Update versionString whenever making a backwards-incompatible change to the cache file format 86const versionString = "Android finder version 1" 87 88// a CacheParams specifies which files and directories the user wishes be scanned and 89// potentially added to the cache 90type CacheParams struct { 91 // WorkingDirectory is used as a base for any relative file paths given to the Finder 92 WorkingDirectory string 93 94 // RootDirs are the root directories used to initiate the search 95 RootDirs []string 96 97 // ExcludeDirs are directory names that if encountered are removed from the search 98 ExcludeDirs []string 99 100 // PruneFiles are file names that if encountered prune their entire directory 101 // (including siblings) 102 PruneFiles []string 103 104 // IncludeFiles are file names to include as matches 105 IncludeFiles []string 106 107 // IncludeSuffixes are filename suffixes to include as matches. 108 IncludeSuffixes []string 109} 110 111// a cacheConfig stores the inputs that determine what should be included in the cache 112type cacheConfig struct { 113 CacheParams 114 115 // FilesystemView is a unique identifier telling which parts of which file systems 116 // are readable by the Finder. In practice its value is essentially username@hostname. 117 // FilesystemView is set to ensure that a cache file copied to another host or 118 // found by another user doesn't inadvertently get reused. 119 FilesystemView string 120} 121 122func (p *cacheConfig) Dump() ([]byte, error) { 123 bytes, err := json.Marshal(p) 124 return bytes, err 125} 126 127// a cacheMetadata stores version information about the cache 128type cacheMetadata struct { 129 // The Version enables the Finder to determine whether it can even parse the file 130 // If the version changes, the entire cache file must be regenerated 131 Version string 132 133 // The CacheParams enables the Finder to determine whether the parameters match 134 // If the CacheParams change, the Finder can choose how much of the cache file to reuse 135 // (although in practice, the Finder will probably choose to ignore the entire file anyway) 136 Config cacheConfig 137} 138 139type Logger interface { 140 Output(calldepth int, s string) error 141} 142 143// the Finder is the main struct that callers will want to use 144type Finder struct { 145 // configuration 146 DbPath string 147 numDbLoadingThreads int 148 numSearchingThreads int 149 cacheMetadata cacheMetadata 150 logger Logger 151 filesystem fs.FileSystem 152 153 // temporary state 154 threadPool *threadPool 155 mutex sync.Mutex 156 fsErrs []fsErr 157 errlock sync.Mutex 158 shutdownWaitgroup sync.WaitGroup 159 160 // non-temporary state 161 modifiedFlag int32 162 nodes pathMap 163} 164 165var defaultNumThreads = runtime.NumCPU() * 2 166 167// New creates a new Finder for use 168func New(cacheParams CacheParams, filesystem fs.FileSystem, 169 logger Logger, dbPath string) (f *Finder, err error) { 170 return newImpl(cacheParams, filesystem, logger, dbPath, defaultNumThreads) 171} 172 173// newImpl is like New but accepts more params 174func newImpl(cacheParams CacheParams, filesystem fs.FileSystem, 175 logger Logger, dbPath string, numThreads int) (f *Finder, err error) { 176 numDbLoadingThreads := numThreads 177 numSearchingThreads := numThreads 178 179 metadata := cacheMetadata{ 180 Version: versionString, 181 Config: cacheConfig{ 182 CacheParams: cacheParams, 183 FilesystemView: filesystem.ViewId(), 184 }, 185 } 186 187 f = &Finder{ 188 numDbLoadingThreads: numDbLoadingThreads, 189 numSearchingThreads: numSearchingThreads, 190 cacheMetadata: metadata, 191 logger: logger, 192 filesystem: filesystem, 193 194 nodes: *newPathMap("/"), 195 DbPath: dbPath, 196 197 shutdownWaitgroup: sync.WaitGroup{}, 198 } 199 200 f.loadFromFilesystem() 201 202 // check for any filesystem errors 203 err = f.getErr() 204 if err != nil { 205 return nil, err 206 } 207 208 // confirm that every path mentioned in the CacheConfig exists 209 for _, path := range cacheParams.RootDirs { 210 if !filepath.IsAbs(path) { 211 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path) 212 } 213 node := f.nodes.GetNode(filepath.Clean(path), false) 214 if node == nil || node.ModTime == 0 { 215 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path) 216 } 217 } 218 219 return f, nil 220} 221 222// FindNamed searches for every cached file 223func (f *Finder) FindAll() []string { 224 return f.FindAt("/") 225} 226 227// FindNamed searches for every cached file under <rootDir> 228func (f *Finder) FindAt(rootDir string) []string { 229 filter := func(entries DirEntries) (dirNames []string, fileNames []string) { 230 return entries.DirNames, entries.FileNames 231 } 232 return f.FindMatching(rootDir, filter) 233} 234 235// FindNamed searches for every cached file named <fileName> 236func (f *Finder) FindNamed(fileName string) []string { 237 return f.FindNamedAt("/", fileName) 238} 239 240// FindNamedAt searches under <rootPath> for every file named <fileName> 241// The reason a caller might use FindNamedAt instead of FindNamed is if they want 242// to limit their search to a subset of the cache 243func (f *Finder) FindNamedAt(rootPath string, fileName string) []string { 244 filter := func(entries DirEntries) (dirNames []string, fileNames []string) { 245 matches := []string{} 246 for _, foundName := range entries.FileNames { 247 if foundName == fileName { 248 matches = append(matches, foundName) 249 } 250 } 251 return entries.DirNames, matches 252 253 } 254 return f.FindMatching(rootPath, filter) 255} 256 257// FindFirstNamed searches for every file named <fileName> 258// Whenever it finds a match, it stops search subdirectories 259func (f *Finder) FindFirstNamed(fileName string) []string { 260 return f.FindFirstNamedAt("/", fileName) 261} 262 263// FindFirstNamedAt searches for every file named <fileName> 264// Whenever it finds a match, it stops search subdirectories 265func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string { 266 filter := func(entries DirEntries) (dirNames []string, fileNames []string) { 267 matches := []string{} 268 for _, foundName := range entries.FileNames { 269 if foundName == fileName { 270 matches = append(matches, foundName) 271 } 272 } 273 274 if len(matches) > 0 { 275 return []string{}, matches 276 } 277 return entries.DirNames, matches 278 } 279 return f.FindMatching(rootPath, filter) 280} 281 282// FindMatching is the most general exported function for searching for files in the cache 283// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries 284// in place, removing file paths and directories as desired. 285// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe. 286func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string { 287 // set up some parameters 288 scanStart := time.Now() 289 var isRel bool 290 workingDir := f.cacheMetadata.Config.WorkingDirectory 291 292 isRel = !filepath.IsAbs(rootPath) 293 if isRel { 294 rootPath = filepath.Join(workingDir, rootPath) 295 } 296 297 rootPath = filepath.Clean(rootPath) 298 299 // ensure nothing else is using the Finder 300 f.verbosef("FindMatching waiting for finder to be idle\n") 301 f.lock() 302 defer f.unlock() 303 304 node := f.nodes.GetNode(rootPath, false) 305 if node == nil { 306 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n", 307 rootPath, f.cacheMetadata.Config.CacheParams) 308 // path is not found; don't do a search 309 return []string{} 310 } 311 312 // search for matching files 313 f.verbosef("Finder finding %v using cache\n", rootPath) 314 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads) 315 316 // format and return results 317 if isRel { 318 for i := 0; i < len(results); i++ { 319 results[i] = strings.Replace(results[i], workingDir+"/", "", 1) 320 } 321 } 322 sort.Strings(results) 323 f.verbosef("Found %v files under %v in %v using cache\n", 324 len(results), rootPath, time.Since(scanStart)) 325 return results 326} 327 328// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete 329// Currently, that only entails waiting for the database dump to complete. 330func (f *Finder) Shutdown() { 331 f.WaitForDbDump() 332} 333 334// WaitForDbDump returns once the database has been written to f.DbPath. 335func (f *Finder) WaitForDbDump() { 336 f.shutdownWaitgroup.Wait() 337} 338 339// End of public api 340 341func (f *Finder) goDumpDb() { 342 if f.wasModified() { 343 f.shutdownWaitgroup.Add(1) 344 go func() { 345 err := f.dumpDb() 346 if err != nil { 347 f.verbosef("%v\n", err) 348 } 349 f.shutdownWaitgroup.Done() 350 }() 351 } else { 352 f.verbosef("Skipping dumping unmodified db\n") 353 } 354} 355 356// joinCleanPaths is like filepath.Join but is faster because 357// joinCleanPaths doesn't have to support paths ending in "/" or containing ".." 358func joinCleanPaths(base string, leaf string) string { 359 if base == "" { 360 return leaf 361 } 362 if base == "/" { 363 return base + leaf 364 } 365 if leaf == "" { 366 return base 367 } 368 return base + "/" + leaf 369} 370 371func (f *Finder) verbosef(format string, args ...interface{}) { 372 f.logger.Output(2, fmt.Sprintf(format, args...)) 373} 374 375// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem 376func (f *Finder) loadFromFilesystem() { 377 f.threadPool = newThreadPool(f.numDbLoadingThreads) 378 379 err := f.startFromExternalCache() 380 if err != nil { 381 f.startWithoutExternalCache() 382 } 383 384 f.goDumpDb() 385 386 f.threadPool = nil 387} 388 389func (f *Finder) startFind(path string) { 390 if !filepath.IsAbs(path) { 391 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path) 392 } 393 node := f.nodes.GetNode(path, true) 394 f.statDirAsync(node) 395} 396 397func (f *Finder) lock() { 398 f.mutex.Lock() 399} 400 401func (f *Finder) unlock() { 402 f.mutex.Unlock() 403} 404 405// a statResponse is the relevant portion of the response from the filesystem to a Stat call 406type statResponse struct { 407 ModTime int64 408 Inode uint64 409 Device uint64 410} 411 412// a pathAndStats stores a path and its stats 413type pathAndStats struct { 414 statResponse 415 416 Path string 417} 418 419// a dirFullInfo stores all of the relevant information we know about a directory 420type dirFullInfo struct { 421 pathAndStats 422 423 FileNames []string 424} 425 426// a PersistedDirInfo is the information about a dir that we save to our cache on disk 427type PersistedDirInfo struct { 428 // These field names are short because they are repeated many times in the output json file 429 P string // path 430 T int64 // modification time 431 I uint64 // inode number 432 F []string // relevant filenames contained 433} 434 435// a PersistedDirs is the information that we persist for a group of dirs 436type PersistedDirs struct { 437 // the device on which each directory is stored 438 Device uint64 439 // the common root path to which all contained dirs are relative 440 Root string 441 // the directories themselves 442 Dirs []PersistedDirInfo 443} 444 445// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time 446type CacheEntry []PersistedDirs 447 448// a DirEntries lists the files and directories contained directly within a specific directory 449type DirEntries struct { 450 Path string 451 452 // elements of DirNames are just the dir names; they don't include any '/' character 453 DirNames []string 454 // elements of FileNames are just the file names; they don't include '/' character 455 FileNames []string 456} 457 458// a WalkFunc is the type that is passed into various Find functions for determining which 459// directories the caller wishes be walked. The WalkFunc is expected to decide which 460// directories to walk and which files to consider as matches to the original query. 461type WalkFunc func(DirEntries) (dirs []string, files []string) 462 463// a mapNode stores the relevant stats about a directory to be stored in a pathMap 464type mapNode struct { 465 statResponse 466 FileNames []string 467} 468 469// a pathMap implements the directory tree structure of nodes 470type pathMap struct { 471 mapNode 472 473 path string 474 475 children map[string]*pathMap 476 477 // number of descendent nodes, including self 478 approximateNumDescendents int 479} 480 481func newPathMap(path string) *pathMap { 482 result := &pathMap{path: path, children: make(map[string]*pathMap, 4), 483 approximateNumDescendents: 1} 484 return result 485} 486 487// GetNode returns the node at <path> 488func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap { 489 if len(path) > 0 && path[0] == '/' { 490 path = path[1:] 491 } 492 493 node := m 494 for { 495 if path == "" { 496 return node 497 } 498 499 index := strings.Index(path, "/") 500 var firstComponent string 501 if index >= 0 { 502 firstComponent = path[:index] 503 path = path[index+1:] 504 } else { 505 firstComponent = path 506 path = "" 507 } 508 509 child, found := node.children[firstComponent] 510 511 if !found { 512 if createIfNotFound { 513 child = node.newChild(firstComponent) 514 } else { 515 return nil 516 } 517 } 518 519 node = child 520 } 521} 522 523func (m *pathMap) newChild(name string) (child *pathMap) { 524 path := joinCleanPaths(m.path, name) 525 newChild := newPathMap(path) 526 m.children[name] = newChild 527 528 return m.children[name] 529} 530 531func (m *pathMap) UpdateNumDescendents() int { 532 count := 1 533 for _, child := range m.children { 534 count += child.approximateNumDescendents 535 } 536 m.approximateNumDescendents = count 537 return count 538} 539 540func (m *pathMap) UpdateNumDescendentsRecursive() { 541 for _, child := range m.children { 542 child.UpdateNumDescendentsRecursive() 543 } 544 m.UpdateNumDescendents() 545} 546 547func (m *pathMap) MergeIn(other *pathMap) { 548 for key, theirs := range other.children { 549 ours, found := m.children[key] 550 if found { 551 ours.MergeIn(theirs) 552 } else { 553 m.children[key] = theirs 554 } 555 } 556 if other.ModTime != 0 { 557 m.mapNode = other.mapNode 558 } 559 m.UpdateNumDescendents() 560} 561 562func (m *pathMap) DumpAll() []dirFullInfo { 563 results := []dirFullInfo{} 564 m.dumpInto("", &results) 565 return results 566} 567 568func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) { 569 *results = append(*results, 570 dirFullInfo{ 571 pathAndStats{statResponse: m.statResponse, Path: path}, 572 m.FileNames}, 573 ) 574 for key, child := range m.children { 575 childPath := joinCleanPaths(path, key) 576 if len(childPath) == 0 || childPath[0] != '/' { 577 childPath = "/" + childPath 578 } 579 child.dumpInto(childPath, results) 580 } 581} 582 583// a semaphore can be locked by up to <capacity> callers at once 584type semaphore struct { 585 pool chan bool 586} 587 588func newSemaphore(capacity int) *semaphore { 589 return &semaphore{pool: make(chan bool, capacity)} 590} 591 592func (l *semaphore) Lock() { 593 l.pool <- true 594} 595 596func (l *semaphore) Unlock() { 597 <-l.pool 598} 599 600// A threadPool runs goroutines and supports throttling and waiting. 601// Without throttling, Go may exhaust the maximum number of various resources, such as 602// threads or file descriptors, and crash the program. 603type threadPool struct { 604 receivedRequests sync.WaitGroup 605 activeRequests semaphore 606} 607 608func newThreadPool(maxNumConcurrentThreads int) *threadPool { 609 return &threadPool{ 610 receivedRequests: sync.WaitGroup{}, 611 activeRequests: *newSemaphore(maxNumConcurrentThreads), 612 } 613} 614 615// Run requests to run the given function in its own goroutine 616func (p *threadPool) Run(function func()) { 617 p.receivedRequests.Add(1) 618 // If Run() was called from within a goroutine spawned by this threadPool, 619 // then we may need to return from Run() before having capacity to actually 620 // run <function>. 621 // 622 // It's possible that the body of <function> contains a statement (such as a syscall) 623 // that will cause Go to pin it to a thread, or will contain a statement that uses 624 // another resource that is in short supply (such as a file descriptor), so we can't 625 // actually run <function> until we have capacity. 626 // 627 // However, the semaphore used for synchronization is implemented via a channel and 628 // shouldn't require a new thread for each access. 629 go func() { 630 p.activeRequests.Lock() 631 function() 632 p.activeRequests.Unlock() 633 p.receivedRequests.Done() 634 }() 635} 636 637// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait 638func (p *threadPool) Wait() { 639 p.receivedRequests.Wait() 640} 641 642type fsErr struct { 643 path string 644 err error 645} 646 647func (e fsErr) String() string { 648 return e.path + ": " + e.err.Error() 649} 650 651func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) { 652 // group each dirFullInfo by its Device, to avoid having to repeat it in the output 653 dirsByDevice := map[uint64][]PersistedDirInfo{} 654 for _, entry := range dirInfos { 655 _, found := dirsByDevice[entry.Device] 656 if !found { 657 dirsByDevice[entry.Device] = []PersistedDirInfo{} 658 } 659 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device], 660 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames}) 661 } 662 663 cacheEntry := CacheEntry{} 664 665 for device, infos := range dirsByDevice { 666 // find common prefix 667 prefix := "" 668 if len(infos) > 0 { 669 prefix = infos[0].P 670 } 671 for _, info := range infos { 672 for !strings.HasPrefix(info.P+"/", prefix+"/") { 673 prefix = filepath.Dir(prefix) 674 if prefix == "/" { 675 break 676 } 677 } 678 } 679 // remove common prefix 680 for i := range infos { 681 suffix := strings.Replace(infos[i].P, prefix, "", 1) 682 if len(suffix) > 0 && suffix[0] == '/' { 683 suffix = suffix[1:] 684 } 685 infos[i].P = suffix 686 } 687 688 // turn the map (keyed by device) into a list of structs with labeled fields 689 // this is to improve readability of the output 690 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos}) 691 } 692 693 // convert to json. 694 // it would save some space to use a different format than json for the db file, 695 // but the space and time savings are small, and json is easy for humans to read 696 bytes, err := json.Marshal(cacheEntry) 697 return bytes, err 698} 699 700func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) { 701 var cacheEntry CacheEntry 702 err := json.Unmarshal(bytes, &cacheEntry) 703 if err != nil { 704 return nil, err 705 } 706 707 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields) 708 capacity := 0 709 for _, element := range cacheEntry { 710 capacity += len(element.Dirs) 711 } 712 nodes := make([]dirFullInfo, capacity) 713 count := 0 714 for _, element := range cacheEntry { 715 for _, dir := range element.Dirs { 716 path := joinCleanPaths(element.Root, dir.P) 717 718 nodes[count] = dirFullInfo{ 719 pathAndStats: pathAndStats{ 720 statResponse: statResponse{ 721 ModTime: dir.T, Inode: dir.I, Device: element.Device, 722 }, 723 Path: path}, 724 FileNames: dir.F} 725 count++ 726 } 727 } 728 return nodes, nil 729} 730 731// We use the following separator byte to distinguish individually parseable blocks of json 732// because we know this separator won't appear in the json that we're parsing. 733// 734// The newline byte can only appear in a UTF-8 stream if the newline character appears, because: 735// - The newline character is encoded as "0000 1010" in binary ("0a" in hex) 736// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte 737// character. 738// 739// We know that the newline character will never appear in our json string, because: 740// - If a newline character appears as part of a data string, then json encoding will 741// emit two characters instead: '\' and 'n'. 742// - The json encoder that we use doesn't emit the optional newlines between any of its 743// other outputs. 744const lineSeparator = byte('\n') 745 746func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) { 747 return reader.ReadBytes(lineSeparator) 748} 749 750// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder 751func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool { 752 cacheVersionBytes, err := f.readLine(cacheReader) 753 if err != nil { 754 f.verbosef("Failed to read database header; database is invalid\n") 755 return false 756 } 757 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator { 758 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1] 759 } 760 cacheVersionString := string(cacheVersionBytes) 761 currentVersion := f.cacheMetadata.Version 762 if cacheVersionString != currentVersion { 763 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion) 764 return false 765 } 766 767 cacheParamBytes, err := f.readLine(cacheReader) 768 if err != nil { 769 f.verbosef("Failed to read database search params; database is invalid\n") 770 return false 771 } 772 773 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator { 774 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1] 775 } 776 777 currentParamBytes, err := f.cacheMetadata.Config.Dump() 778 if err != nil { 779 panic("Finder failed to serialize its parameters") 780 } 781 cacheParamString := string(cacheParamBytes) 782 currentParamString := string(currentParamBytes) 783 if cacheParamString != currentParamString { 784 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString) 785 return false 786 } 787 return true 788} 789 790// loadBytes compares the cache info in <data> to the state of the filesystem 791// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked 792func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) { 793 794 helperStartTime := time.Now() 795 796 cachedNodes, err := f.parseCacheEntry(data) 797 if err != nil { 798 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error()) 799 } 800 801 unmarshalDate := time.Now() 802 f.verbosef("Unmarshaled %v objects for %v in %v\n", 803 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime)) 804 805 tempMap := newPathMap("/") 806 stats := make([]statResponse, len(cachedNodes)) 807 808 for i, node := range cachedNodes { 809 // check the file system for an updated timestamp 810 stats[i] = f.statDirSync(node.Path) 811 } 812 813 dirsToWalk = []string{} 814 for i, cachedNode := range cachedNodes { 815 updated := stats[i] 816 // save the cached value 817 container := tempMap.GetNode(cachedNode.Path, true) 818 container.mapNode = mapNode{statResponse: updated} 819 820 // if the metadata changed and the directory still exists, then 821 // make a note to walk it later 822 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 { 823 f.setModified() 824 // make a note that the directory needs to be walked 825 dirsToWalk = append(dirsToWalk, cachedNode.Path) 826 } else { 827 container.mapNode.FileNames = cachedNode.FileNames 828 } 829 } 830 // count the number of nodes to improve our understanding of the shape of the tree, 831 // thereby improving parallelism of subsequent searches 832 tempMap.UpdateNumDescendentsRecursive() 833 834 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate)) 835 return tempMap, dirsToWalk, nil 836} 837 838// startFromExternalCache loads the cache database from disk 839// startFromExternalCache waits to return until the load of the cache db is complete, but 840// startFromExternalCache does not wait for all every listDir() or statDir() request to complete 841func (f *Finder) startFromExternalCache() (err error) { 842 startTime := time.Now() 843 dbPath := f.DbPath 844 845 // open cache file and validate its header 846 reader, err := f.filesystem.Open(dbPath) 847 if err != nil { 848 return errors.New("No data to load from database\n") 849 } 850 defer reader.Close() 851 bufferedReader := bufio.NewReader(reader) 852 if !f.validateCacheHeader(bufferedReader) { 853 return errors.New("Cache header does not match") 854 } 855 f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath) 856 857 // read the file and spawn threads to process it 858 nodesToWalk := [][]*pathMap{} 859 mainTree := newPathMap("/") 860 861 // read the blocks and stream them into <blockChannel> 862 type dataBlock struct { 863 id int 864 err error 865 data []byte 866 } 867 blockChannel := make(chan dataBlock, f.numDbLoadingThreads) 868 readBlocks := func() { 869 index := 0 870 for { 871 // It takes some time to unmarshal the input from json, so we want 872 // to unmarshal it in parallel. In order to find valid places to 873 // break the input, we scan for the line separators that we inserted 874 // (for this purpose) when we dumped the database. 875 data, err := f.readLine(bufferedReader) 876 var response dataBlock 877 done := false 878 if err != nil && err != io.EOF { 879 response = dataBlock{id: index, err: err, data: nil} 880 done = true 881 } else { 882 done = (err == io.EOF) 883 response = dataBlock{id: index, err: nil, data: data} 884 } 885 blockChannel <- response 886 index++ 887 duration := time.Since(startTime) 888 f.verbosef("Read block %v after %v\n", index, duration) 889 if done { 890 f.verbosef("Read %v blocks in %v\n", index, duration) 891 close(blockChannel) 892 return 893 } 894 } 895 } 896 go readBlocks() 897 898 // Read from <blockChannel> and stream the responses into <resultChannel>. 899 type workResponse struct { 900 id int 901 err error 902 tree *pathMap 903 updatedDirs []string 904 } 905 resultChannel := make(chan workResponse) 906 processBlocks := func() { 907 numProcessed := 0 908 threadPool := newThreadPool(f.numDbLoadingThreads) 909 for { 910 // get a block to process 911 block, received := <-blockChannel 912 if !received { 913 break 914 } 915 916 if block.err != nil { 917 resultChannel <- workResponse{err: block.err} 918 break 919 } 920 numProcessed++ 921 // wait until there is CPU available to process it 922 threadPool.Run( 923 func() { 924 processStartTime := time.Now() 925 f.verbosef("Starting to process block %v after %v\n", 926 block.id, processStartTime.Sub(startTime)) 927 tempMap, updatedDirs, err := f.loadBytes(block.id, block.data) 928 var response workResponse 929 if err != nil { 930 f.verbosef( 931 "Block %v failed to parse with error %v\n", 932 block.id, err) 933 response = workResponse{err: err} 934 } else { 935 response = workResponse{ 936 id: block.id, 937 err: nil, 938 tree: tempMap, 939 updatedDirs: updatedDirs, 940 } 941 } 942 f.verbosef("Processed block %v in %v\n", 943 block.id, time.Since(processStartTime), 944 ) 945 resultChannel <- response 946 }, 947 ) 948 } 949 threadPool.Wait() 950 f.verbosef("Finished processing %v blocks in %v\n", 951 numProcessed, time.Since(startTime)) 952 close(resultChannel) 953 } 954 go processBlocks() 955 956 // Read from <resultChannel> and use the results 957 combineResults := func() (err error) { 958 for { 959 result, received := <-resultChannel 960 if !received { 961 break 962 } 963 if err != nil { 964 // In case of an error, wait for work to complete before 965 // returning the error. This ensures that any subsequent 966 // work doesn't need to compete for resources (and possibly 967 // fail due to, for example, a filesystem limit on the number of 968 // concurrently open files) with past work. 969 continue 970 } 971 if result.err != nil { 972 err = result.err 973 continue 974 } 975 // update main tree 976 mainTree.MergeIn(result.tree) 977 // record any new directories that we will need to Stat() 978 updatedNodes := make([]*pathMap, len(result.updatedDirs)) 979 for j, dir := range result.updatedDirs { 980 node := mainTree.GetNode(dir, false) 981 updatedNodes[j] = node 982 } 983 nodesToWalk = append(nodesToWalk, updatedNodes) 984 } 985 return err 986 } 987 err = combineResults() 988 if err != nil { 989 return err 990 } 991 992 f.nodes = *mainTree 993 994 // after having loaded the entire db and therefore created entries for 995 // the directories we know of, now it's safe to start calling ReadDir on 996 // any updated directories 997 for i := range nodesToWalk { 998 f.listDirsAsync(nodesToWalk[i]) 999 } 1000 f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime)) 1001 f.threadPool.Wait() 1002 f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime)) 1003 1004 return err 1005} 1006 1007// startWithoutExternalCache starts scanning the filesystem according to the cache config 1008// startWithoutExternalCache should be called if startFromExternalCache is not applicable 1009func (f *Finder) startWithoutExternalCache() { 1010 startTime := time.Now() 1011 configDirs := f.cacheMetadata.Config.RootDirs 1012 1013 // clean paths 1014 candidates := make([]string, len(configDirs)) 1015 for i, dir := range configDirs { 1016 candidates[i] = filepath.Clean(dir) 1017 } 1018 // remove duplicates 1019 dirsToScan := make([]string, 0, len(configDirs)) 1020 for _, candidate := range candidates { 1021 include := true 1022 for _, included := range dirsToScan { 1023 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") { 1024 include = false 1025 break 1026 } 1027 } 1028 if include { 1029 dirsToScan = append(dirsToScan, candidate) 1030 } 1031 } 1032 1033 // start searching finally 1034 for _, path := range dirsToScan { 1035 f.verbosef("Starting find of %v\n", path) 1036 f.startFind(path) 1037 } 1038 1039 f.threadPool.Wait() 1040 1041 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime)) 1042} 1043 1044// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid 1045func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) { 1046 if old.Inode != new.Inode { 1047 return false 1048 } 1049 if old.ModTime != new.ModTime { 1050 return false 1051 } 1052 if old.Device != new.Device { 1053 return false 1054 } 1055 return true 1056} 1057 1058func (f *Finder) wasModified() bool { 1059 return atomic.LoadInt32(&f.modifiedFlag) > 0 1060} 1061 1062func (f *Finder) setModified() { 1063 var newVal int32 1064 newVal = 1 1065 atomic.StoreInt32(&f.modifiedFlag, newVal) 1066} 1067 1068// sortedDirEntries exports directory entries to facilitate dumping them to the external cache 1069func (f *Finder) sortedDirEntries() []dirFullInfo { 1070 startTime := time.Now() 1071 nodes := make([]dirFullInfo, 0) 1072 for _, node := range f.nodes.DumpAll() { 1073 if node.ModTime != 0 { 1074 nodes = append(nodes, node) 1075 } 1076 } 1077 discoveryDate := time.Now() 1078 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime)) 1079 less := func(i int, j int) bool { 1080 return nodes[i].Path < nodes[j].Path 1081 } 1082 sort.Slice(nodes, less) 1083 sortDate := time.Now() 1084 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate)) 1085 1086 return nodes 1087} 1088 1089// serializeDb converts the cache database into a form to save to disk 1090func (f *Finder) serializeDb() ([]byte, error) { 1091 // sort dir entries 1092 var entryList = f.sortedDirEntries() 1093 1094 // Generate an output file that can be conveniently loaded using the same number of threads 1095 // as were used in this execution (because presumably that will be the number of threads 1096 // used in the next execution too) 1097 1098 // generate header 1099 header := []byte{} 1100 header = append(header, []byte(f.cacheMetadata.Version)...) 1101 header = append(header, lineSeparator) 1102 configDump, err := f.cacheMetadata.Config.Dump() 1103 if err != nil { 1104 return nil, err 1105 } 1106 header = append(header, configDump...) 1107 1108 // serialize individual blocks in parallel 1109 numBlocks := f.numDbLoadingThreads 1110 if numBlocks > len(entryList) { 1111 numBlocks = len(entryList) 1112 } 1113 blocks := make([][]byte, 1+numBlocks) 1114 blocks[0] = header 1115 blockMin := 0 1116 wg := sync.WaitGroup{} 1117 var errLock sync.Mutex 1118 1119 for i := 1; i <= numBlocks; i++ { 1120 // identify next block 1121 blockMax := len(entryList) * i / numBlocks 1122 block := entryList[blockMin:blockMax] 1123 1124 // process block 1125 wg.Add(1) 1126 go func(index int, block []dirFullInfo) { 1127 byteBlock, subErr := f.serializeCacheEntry(block) 1128 f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock)) 1129 if subErr != nil { 1130 f.verbosef("%v\n", subErr.Error()) 1131 errLock.Lock() 1132 err = subErr 1133 errLock.Unlock() 1134 } else { 1135 blocks[index] = byteBlock 1136 } 1137 wg.Done() 1138 }(i, block) 1139 1140 blockMin = blockMax 1141 } 1142 1143 wg.Wait() 1144 1145 if err != nil { 1146 return nil, err 1147 } 1148 1149 content := bytes.Join(blocks, []byte{lineSeparator}) 1150 1151 return content, nil 1152} 1153 1154// dumpDb saves the cache database to disk 1155func (f *Finder) dumpDb() error { 1156 startTime := time.Now() 1157 f.verbosef("Dumping db\n") 1158 1159 tempPath := f.DbPath + ".tmp" 1160 1161 bytes, err := f.serializeDb() 1162 if err != nil { 1163 return err 1164 } 1165 serializeDate := time.Now() 1166 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime)) 1167 // dump file and atomically move 1168 err = f.filesystem.WriteFile(tempPath, bytes, 0777) 1169 if err != nil { 1170 return err 1171 } 1172 err = f.filesystem.Rename(tempPath, f.DbPath) 1173 if err != nil { 1174 return err 1175 } 1176 1177 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate)) 1178 return nil 1179 1180} 1181 1182// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore 1183func (f *Finder) canIgnoreFsErr(err error) bool { 1184 pathErr, isPathErr := err.(*os.PathError) 1185 if !isPathErr { 1186 // Don't recognize this error 1187 return false 1188 } 1189 if os.IsPermission(pathErr) { 1190 // Permission errors are ignored: 1191 // https://issuetracker.google.com/37553659 1192 // https://github.com/google/kati/pull/116 1193 return true 1194 } 1195 if pathErr.Err == os.ErrNotExist { 1196 // If a directory doesn't exist, that generally means the cache is out-of-date 1197 return true 1198 } 1199 // Don't recognize this error 1200 return false 1201} 1202 1203// onFsError should be called whenever a potentially fatal error is returned from a filesystem call 1204func (f *Finder) onFsError(path string, err error) { 1205 if !f.canIgnoreFsErr(err) { 1206 // We could send the errors through a channel instead, although that would cause this call 1207 // to block unless we preallocated a sufficient buffer or spawned a reader thread. 1208 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly 1209 // more convenient to use a lock. Only in an unusual situation should this code be 1210 // invoked anyway. 1211 f.errlock.Lock() 1212 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err}) 1213 f.errlock.Unlock() 1214 } 1215} 1216 1217// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache 1218func (f *Finder) discardErrsForPrunedPaths() { 1219 // This function could be somewhat inefficient due to being single-threaded, 1220 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway. 1221 relevantErrs := make([]fsErr, 0, len(f.fsErrs)) 1222 for _, fsErr := range f.fsErrs { 1223 path := fsErr.path 1224 node := f.nodes.GetNode(path, false) 1225 if node != nil { 1226 // The path in question wasn't pruned due to a failure to process a parent directory. 1227 // So, the failure to process this path is important 1228 relevantErrs = append(relevantErrs, fsErr) 1229 } 1230 } 1231 f.fsErrs = relevantErrs 1232} 1233 1234// getErr returns an error based on previous calls to onFsErr, if any 1235func (f *Finder) getErr() error { 1236 f.discardErrsForPrunedPaths() 1237 1238 numErrs := len(f.fsErrs) 1239 if numErrs < 1 { 1240 return nil 1241 } 1242 1243 maxNumErrsToInclude := 10 1244 message := "" 1245 if numErrs > maxNumErrsToInclude { 1246 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude]) 1247 } else { 1248 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs) 1249 } 1250 1251 return errors.New(message) 1252} 1253 1254func (f *Finder) statDirAsync(dir *pathMap) { 1255 node := dir 1256 path := dir.path 1257 f.threadPool.Run( 1258 func() { 1259 updatedStats := f.statDirSync(path) 1260 1261 if !f.isInfoUpToDate(node.statResponse, updatedStats) { 1262 node.mapNode = mapNode{ 1263 statResponse: updatedStats, 1264 FileNames: []string{}, 1265 } 1266 f.setModified() 1267 if node.statResponse.ModTime != 0 { 1268 // modification time was updated, so re-scan for 1269 // child directories 1270 f.listDirAsync(dir) 1271 } 1272 } 1273 }, 1274 ) 1275} 1276 1277func (f *Finder) statDirSync(path string) statResponse { 1278 1279 fileInfo, err := f.filesystem.Lstat(path) 1280 1281 var stats statResponse 1282 if err != nil { 1283 // possibly record this error 1284 f.onFsError(path, err) 1285 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0) 1286 return stats 1287 } 1288 modTime := fileInfo.ModTime() 1289 stats = statResponse{} 1290 inode, err := f.filesystem.InodeNumber(fileInfo) 1291 if err != nil { 1292 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error())) 1293 } 1294 stats.Inode = inode 1295 device, err := f.filesystem.DeviceNumber(fileInfo) 1296 if err != nil { 1297 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error())) 1298 } 1299 stats.Device = device 1300 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo) 1301 1302 if err != nil { 1303 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error())) 1304 } 1305 // We're only interested in knowing whether anything about the directory 1306 // has changed since last check, so we use the latest of the two 1307 // modification times (content modification (mtime) and 1308 // permission modification (ctime)) 1309 if permissionsChangeTime.After(modTime) { 1310 modTime = permissionsChangeTime 1311 } 1312 stats.ModTime = modTime.UnixNano() 1313 1314 return stats 1315} 1316 1317func (f *Finder) shouldIncludeFile(fileName string) bool { 1318 for _, includedName := range f.cacheMetadata.Config.IncludeFiles { 1319 if fileName == includedName { 1320 return true 1321 } 1322 } 1323 for _, includeSuffix := range f.cacheMetadata.Config.IncludeSuffixes { 1324 if strings.HasSuffix(fileName, includeSuffix) { 1325 return true 1326 } 1327 } 1328 return false 1329} 1330 1331// pruneCacheCandidates removes the items that we don't want to include in our persistent cache 1332func (f *Finder) pruneCacheCandidates(items *DirEntries) { 1333 1334 for _, fileName := range items.FileNames { 1335 for _, abortedName := range f.cacheMetadata.Config.PruneFiles { 1336 if fileName == abortedName { 1337 items.FileNames = []string{} 1338 items.DirNames = []string{} 1339 return 1340 } 1341 } 1342 } 1343 1344 // remove any files that aren't the ones we want to include 1345 writeIndex := 0 1346 for _, fileName := range items.FileNames { 1347 if f.shouldIncludeFile(fileName) { 1348 items.FileNames[writeIndex] = fileName 1349 writeIndex++ 1350 } 1351 } 1352 // resize 1353 items.FileNames = items.FileNames[:writeIndex] 1354 1355 writeIndex = 0 1356 for _, dirName := range items.DirNames { 1357 items.DirNames[writeIndex] = dirName 1358 // ignore other dirs that are known to not be inputs to the build process 1359 include := true 1360 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs { 1361 if dirName == excludedName { 1362 // don't include 1363 include = false 1364 break 1365 } 1366 } 1367 if include { 1368 writeIndex++ 1369 } 1370 } 1371 // resize 1372 items.DirNames = items.DirNames[:writeIndex] 1373} 1374 1375func (f *Finder) listDirsAsync(nodes []*pathMap) { 1376 f.threadPool.Run( 1377 func() { 1378 for i := range nodes { 1379 f.listDirSync(nodes[i]) 1380 } 1381 }, 1382 ) 1383} 1384 1385func (f *Finder) listDirAsync(node *pathMap) { 1386 f.threadPool.Run( 1387 func() { 1388 f.listDirSync(node) 1389 }, 1390 ) 1391} 1392 1393func (f *Finder) listDirSync(dir *pathMap) { 1394 path := dir.path 1395 children, err := f.filesystem.ReadDir(path) 1396 1397 if err != nil { 1398 // possibly record this error 1399 f.onFsError(path, err) 1400 // if listing the contents of the directory fails (presumably due to 1401 // permission denied), then treat the directory as empty 1402 children = nil 1403 } 1404 1405 var subdirs []string 1406 var subfiles []string 1407 1408 for _, child := range children { 1409 linkBits := child.Mode() & os.ModeSymlink 1410 isLink := linkBits != 0 1411 if isLink { 1412 childPath := filepath.Join(path, child.Name()) 1413 childStat, err := f.filesystem.Stat(childPath) 1414 if err != nil { 1415 // If stat fails this is probably a broken or dangling symlink, treat it as a file. 1416 subfiles = append(subfiles, child.Name()) 1417 } else if childStat.IsDir() { 1418 // Skip symlink dirs. 1419 // We don't have to support symlink dirs because 1420 // that would cause duplicates. 1421 } else { 1422 // We do have to support symlink files because the link name might be 1423 // different than the target name 1424 // (for example, Android.bp -> build/soong/root.bp) 1425 subfiles = append(subfiles, child.Name()) 1426 } 1427 } else if child.IsDir() { 1428 subdirs = append(subdirs, child.Name()) 1429 } else { 1430 subfiles = append(subfiles, child.Name()) 1431 } 1432 1433 } 1434 parentNode := dir 1435 1436 entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles} 1437 f.pruneCacheCandidates(entry) 1438 1439 // create a pathMap node for each relevant subdirectory 1440 relevantChildren := map[string]*pathMap{} 1441 for _, subdirName := range entry.DirNames { 1442 childNode, found := parentNode.children[subdirName] 1443 // if we already knew of this directory, then we already have a request pending to Stat it 1444 // if we didn't already know of this directory, then we must Stat it now 1445 if !found { 1446 childNode = parentNode.newChild(subdirName) 1447 f.statDirAsync(childNode) 1448 } 1449 relevantChildren[subdirName] = childNode 1450 } 1451 // Note that in rare cases, it's possible that we're reducing the set of 1452 // children via this statement, if these are all true: 1453 // 1. we previously had a cache that knew about subdirectories of parentNode 1454 // 2. the user created a prune-file (described in pruneCacheCandidates) 1455 // inside <parentNode>, which specifies that the contents of parentNode 1456 // are to be ignored. 1457 // The fact that it's possible to remove children here means that *pathMap structs 1458 // must not be looked up from f.nodes by filepath (and instead must be accessed by 1459 // direct pointer) until after every listDirSync completes 1460 parentNode.FileNames = entry.FileNames 1461 parentNode.children = relevantChildren 1462 1463} 1464 1465// listMatches takes a node and a function that specifies which subdirectories and 1466// files to include, and listMatches returns the matches 1467func (f *Finder) listMatches(node *pathMap, 1468 filter WalkFunc) (subDirs []*pathMap, filePaths []string) { 1469 entries := DirEntries{ 1470 FileNames: node.FileNames, 1471 } 1472 entries.DirNames = make([]string, 0, len(node.children)) 1473 for childName := range node.children { 1474 entries.DirNames = append(entries.DirNames, childName) 1475 } 1476 1477 dirNames, fileNames := filter(entries) 1478 1479 subDirs = []*pathMap{} 1480 filePaths = make([]string, 0, len(fileNames)) 1481 for _, fileName := range fileNames { 1482 filePaths = append(filePaths, joinCleanPaths(node.path, fileName)) 1483 } 1484 subDirs = make([]*pathMap, 0, len(dirNames)) 1485 for _, childName := range dirNames { 1486 child, ok := node.children[childName] 1487 if ok { 1488 subDirs = append(subDirs, child) 1489 } 1490 } 1491 1492 return subDirs, filePaths 1493} 1494 1495// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache. 1496func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc, 1497 approxNumThreads int) []string { 1498 1499 if approxNumThreads < 2 { 1500 // Done spawning threads; process remaining directories 1501 return f.findInCacheSinglethreaded(node, filter) 1502 } 1503 1504 totalWork := 0 1505 for _, child := range node.children { 1506 totalWork += child.approximateNumDescendents 1507 } 1508 childrenResults := make(chan []string, len(node.children)) 1509 1510 subDirs, filePaths := f.listMatches(node, filter) 1511 1512 // process child directories 1513 for _, child := range subDirs { 1514 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork 1515 childProcessor := func(child *pathMap) { 1516 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads) 1517 childrenResults <- childResults 1518 } 1519 // If we're allowed to use more than 1 thread to process this directory, 1520 // then instead we use 1 thread for each subdirectory. 1521 // It would be strange to spawn threads for only some subdirectories. 1522 go childProcessor(child) 1523 } 1524 1525 // collect results 1526 for i := 0; i < len(subDirs); i++ { 1527 childResults := <-childrenResults 1528 filePaths = append(filePaths, childResults...) 1529 } 1530 close(childrenResults) 1531 1532 return filePaths 1533} 1534 1535// findInCacheSinglethreaded synchronously searches the cache for all matching file paths 1536// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive 1537func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string { 1538 if node == nil { 1539 return []string{} 1540 } 1541 1542 nodes := []*pathMap{node} 1543 matches := []string{} 1544 1545 for len(nodes) > 0 { 1546 currentNode := nodes[0] 1547 nodes = nodes[1:] 1548 1549 subDirs, filePaths := f.listMatches(currentNode, filter) 1550 1551 nodes = append(nodes, subDirs...) 1552 1553 matches = append(matches, filePaths...) 1554 } 1555 return matches 1556} 1557