From 489f7b0cb287605d71a0f8f686f159f497b726e7 Mon Sep 17 00:00:00 2001 From: Ethan Koenig Date: Fri, 17 Nov 2017 22:41:40 -0800 Subject: [PATCH 1/4] Faster commit lookup --- commit_info.go | 304 ++++++++++++++++++++++++++++++++++++++++++++ commit_info_test.go | 59 +++++++++ tree_entry.go | 114 ----------------- tree_entry_test.go | 54 -------- 4 files changed, 363 insertions(+), 168 deletions(-) create mode 100644 commit_info.go create mode 100644 commit_info_test.go diff --git a/commit_info.go b/commit_info.go new file mode 100644 index 000000000..c01e22ef6 --- /dev/null +++ b/commit_info.go @@ -0,0 +1,304 @@ +// Copyright 2017 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package git + +import ( + "bufio" + "context" + "fmt" + "os/exec" + "path" + "runtime" + "strconv" + "strings" + "sync" + "time" +) + +const ( + // parameters for searching for commit infos. If the untargeted search has + // not found any entries in the past 5 commits, and 12 or fewer entries + // remain, then we'll just let the targeted-searching threads finish off, + // and stop the untargeted search to not interfere. + deferToTargetedSearchColdStreak = 5 + deferToTargetedSearchNumRemainingEntries = 12 +) + +// getCommitsInfoState shared state while getting commit info for entries +type getCommitsInfoState struct { + lock sync.Mutex + /* read-only fields, can be read without the mutex */ + // entries and entryPaths are read-only after initialization, so they can + // safely be read without the mutex + entries []*TreeEntry + // set of filepaths to get info for + entryPaths map[string]struct{} + treePath string + headCommit *Commit + + /* mutable fields, must hold mutex to read or write */ + // map from filepath to commit + commits map[string]*Commit + // set of filepaths that have been or are being searched for in a target search + targetedPaths map[string]struct{} +} + +func (state *getCommitsInfoState) numRemainingEntries() int { + state.lock.Lock() + numRemaining := len(state.entries) - len(state.commits) + state.lock.Unlock() + return numRemaining +} + +func (state *getCommitsInfoState) getTargetedEntryPath() string { + var targetedEntryPath string + state.lock.Lock() + for _, entry := range state.entries { + entryPath := path.Join(state.treePath, entry.Name()) + if _, ok := state.commits[entryPath]; ok { + continue + } else if _, ok = state.targetedPaths[entryPath]; ok { + continue + } + targetedEntryPath = entryPath + state.targetedPaths[entryPath] = struct{}{} + break + } + state.lock.Unlock() + return targetedEntryPath +} + +// repeatedly perform targeted searches for unpopulated entries +func targetedSearch(state *getCommitsInfoState, done chan error) { + for { + entryPath := state.getTargetedEntryPath() + if len(entryPath) == 0 { + done <- nil + return + } + command := NewCommand("rev-list", "-1", "HEAD", "--", entryPath) + output, err := command.RunInDir(state.headCommit.repo.Path) + if err != nil { + done <- err + return + } + id, err := NewIDFromString(strings.TrimSpace(output)) + if err != nil { + done <- err + return + } + commit, err := state.headCommit.repo.getCommit(id) + if err != nil { + done <- err + return + } + state.update(entryPath, commit) + } +} + +func initGetCommitInfoState(entries Entries, headCommit *Commit, treePath string) *getCommitsInfoState { + entryPaths := make(map[string]struct{}, len(entries)) + for _, entry := range entries { + entryPaths[path.Join(treePath, entry.Name())] = struct{}{} + } + if treePath = path.Clean(treePath); treePath == "." { + treePath = "" + } + return &getCommitsInfoState{ + entries: entries, + entryPaths: entryPaths, + commits: make(map[string]*Commit, len(entries)), + targetedPaths: make(map[string]struct{}, len(entries)), + treePath: treePath, + headCommit: headCommit, + } +} + +// GetCommitsInfo gets information of all commits that are corresponding to these entries +func (tes Entries) GetCommitsInfo(commit *Commit, treePath string) ([][]interface{}, error) { + state := initGetCommitInfoState(tes, commit, treePath) + if err := getCommitsInfo(state); err != nil { + return nil, err + } + if len(state.commits) < len(state.entryPaths) { + return nil, fmt.Errorf("could not find commits for all entries") + } + + commitsInfo := make([][]interface{}, len(tes)) + for i, entry := range tes { + commit, ok := state.commits[path.Join(treePath, entry.Name())] + if !ok { + return nil, fmt.Errorf("could not find commit for %s", entry.Name()) + } + switch entry.Type { + case ObjectCommit: + subModuleURL := "" + if subModule, err := state.headCommit.GetSubModule(entry.Name()); err != nil { + return nil, err + } else if subModule != nil { + subModuleURL = subModule.URL + } + subModuleFile := NewSubModuleFile(commit, subModuleURL, entry.ID.String()) + commitsInfo[i] = []interface{}{entry, subModuleFile} + default: + commitsInfo[i] = []interface{}{entry, commit} + } + } + return commitsInfo, nil +} + +func (state *getCommitsInfoState) cleanEntryPath(rawEntryPath string) (string, error) { + if rawEntryPath[0] == '"' { + var err error + rawEntryPath, err = strconv.Unquote(rawEntryPath) + if err != nil { + return rawEntryPath, err + } + } + var entryNameStartIndex int + if len(state.treePath) > 0 { + entryNameStartIndex = len(state.treePath) + 1 + } + + if index := strings.IndexByte(rawEntryPath[entryNameStartIndex:], '/'); index >= 0 { + return rawEntryPath[:entryNameStartIndex+index], nil + } + return rawEntryPath, nil +} + +// update report that the given path was last modified by the given commit. +// Returns whether state.commits was updated +func (state *getCommitsInfoState) update(entryPath string, commit *Commit) bool { + if _, ok := state.entryPaths[entryPath]; !ok { + return false + } + + var updated bool + state.lock.Lock() + if _, ok := state.commits[entryPath]; !ok { + state.commits[entryPath] = commit + updated = true + } + state.lock.Unlock() + return updated +} + +const getCommitsInfoPretty = "--pretty=format:%H %ct %s" + +func getCommitsInfo(state *getCommitsInfoState) error { + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute) + defer cancel() + + args := []string{"log", getCommitsInfoPretty, "--name-status", "-c"} + if len(state.treePath) > 0 { + args = append(args, "--", state.treePath) + } + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Dir = state.headCommit.repo.Path + + readCloser, err := cmd.StdoutPipe() + if err != nil { + return err + } + + if err := cmd.Start(); err != nil { + return err + } + + numThreads := runtime.NumCPU() + done := make(chan error, numThreads) + for i := 0; i < numThreads; i++ { + go targetedSearch(state, done) + } + + scanner := bufio.NewScanner(readCloser) + err = state.processGitLogOutput(scanner) + for i := 0; i < numThreads; i++ { + doneErr := <-done + if doneErr != nil && err == nil { + err = doneErr + } + } + return err +} + +func (state *getCommitsInfoState) processGitLogOutput(scanner *bufio.Scanner) error { + // keep a local cache of seen paths to avoid acquiring a lock for paths + // we've already seen + seenPaths := make(map[string]struct{}, len(state.entryPaths)) + // number of consecutive commits without any finds + coldStreak := 0 + var commit *Commit + var err error + for scanner.Scan() { + line := scanner.Text() + if len(line) == 0 { // in-between commits + numRemainingEntries := state.numRemainingEntries() + if numRemainingEntries == 0 { + break + } + if coldStreak >= deferToTargetedSearchColdStreak && + numRemainingEntries <= deferToTargetedSearchNumRemainingEntries { + // stop this untargeted search, and let the targeted-search threads + // finish the work + break + } + continue + } + if line[0] >= 'A' && line[0] <= 'X' { // a file was changed by the current commit + tabIndex := strings.IndexByte(line, '\t') + if tabIndex < 1 { + return fmt.Errorf("misformatted line: %s", line) + } + entryPath, err := state.cleanEntryPath(line[tabIndex+1:]) + if err != nil { + return err + } + if _, ok := seenPaths[entryPath]; !ok { + if state.update(entryPath, commit) { + coldStreak = 0 + } + seenPaths[entryPath] = struct{}{} + } + continue + } + + // a new commit + commit, err = parseCommitInfo(line) + if err != nil { + return err + } + coldStreak++ + } + return scanner.Err() +} + +// parseCommitInfo parse a commit from a line of `git log` output. Expects the +// line to be formatted according to getCommitsInfoPretty. +func parseCommitInfo(line string) (*Commit, error) { + if len(line) < 43 { + return nil, fmt.Errorf("invalid git output: %s", line) + } + ref, err := NewIDFromString(line[:40]) + if err != nil { + return nil, err + } + spaceIndex := strings.IndexByte(line[41:], ' ') + if spaceIndex < 0 { + return nil, fmt.Errorf("invalid git output: %s", line) + } + unixSeconds, err := strconv.Atoi(line[41 : 41+spaceIndex]) + if err != nil { + return nil, err + } + message := line[spaceIndex+42:] + return &Commit{ + ID: ref, + CommitMessage: message, + Committer: &Signature{ + When: time.Unix(int64(unixSeconds), 0), + }, + }, nil +} diff --git a/commit_info_test.go b/commit_info_test.go new file mode 100644 index 000000000..458c8e61b --- /dev/null +++ b/commit_info_test.go @@ -0,0 +1,59 @@ +package git + +import ( + "os" + "path/filepath" + "testing" + "time" +) + +const benchmarkReposDir = "benchmark/repos/" + +func setupGitRepo(url string, name string) (string, error) { + repoDir := filepath.Join(benchmarkReposDir, name) + if _, err := os.Stat(repoDir); err == nil { + return repoDir, nil + } + return repoDir, Clone(url, repoDir, CloneRepoOptions{ + Mirror: false, + Bare: false, + Quiet: true, + Timeout: 5 * time.Minute, + }) +} + +func BenchmarkEntries_GetCommitsInfo(b *testing.B) { + benchmarks := []struct { + url string + name string + }{ + {url: "https://github.com/go-gitea/gitea.git", name: "gitea"}, + {url: "https://github.com/ethantkoenig/manyfiles.git", name: "manyfiles"}, + {url: "https://github.com/moby/moby.git", name: "moby"}, + {url: "https://github.com/golang/go.git", name: "go"}, + {url: "https://github.com/torvalds/linux.git", name: "linux"}, + } + for _, benchmark := range benchmarks { + var commit *Commit + var entries Entries + if repoPath, err := setupGitRepo(benchmark.url, benchmark.name); err != nil { + b.Fatal(err) + } else if repo, err := OpenRepository(repoPath); err != nil { + b.Fatal(err) + } else if commit, err = repo.GetBranchCommit("master"); err != nil { + b.Fatal(err) + } else if entries, err = commit.Tree.ListEntries(); err != nil { + b.Fatal(err) + } + entries.Sort() + b.ResetTimer() + b.Run(benchmark.name, func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, err := entries.GetCommitsInfo(commit, "") + if err != nil { + b.Fatal(err) + } + } + }) + } +} diff --git a/tree_entry.go b/tree_entry.go index d5730a0d4..41023010c 100644 --- a/tree_entry.go +++ b/tree_entry.go @@ -5,10 +5,6 @@ package git import ( - "fmt" - "path" - "path/filepath" - "runtime" "sort" "strconv" "strings" @@ -162,113 +158,3 @@ func (tes Entries) Sort() { func (tes Entries) CustomSort(cmp func(s1, s2 string) bool) { sort.Sort(customSortableEntries{cmp, tes}) } - -type commitInfo struct { - entryName string - infos []interface{} - err error -} - -// GetCommitsInfo takes advantages of concurrency to speed up getting information -// of all commits that are corresponding to these entries. This method will automatically -// choose the right number of goroutine (concurrency) to use related of the host CPU. -func (tes Entries) GetCommitsInfo(commit *Commit, treePath string) ([][]interface{}, error) { - return tes.GetCommitsInfoWithCustomConcurrency(commit, treePath, 0) -} - -// GetCommitsInfoWithCustomConcurrency takes advantages of concurrency to speed up getting information -// of all commits that are corresponding to these entries. If the given maxConcurrency is negative or -// equal to zero: the right number of goroutine (concurrency) to use will be chosen related of the -// host CPU. -func (tes Entries) GetCommitsInfoWithCustomConcurrency(commit *Commit, treePath string, maxConcurrency int) ([][]interface{}, error) { - if len(tes) == 0 { - return nil, nil - } - - if maxConcurrency <= 0 { - maxConcurrency = runtime.NumCPU() - } - - // Length of taskChan determines how many goroutines (subprocesses) can run at the same time. - // The length of revChan should be same as taskChan so goroutines whoever finished job can - // exit as early as possible, only store data inside channel. - taskChan := make(chan bool, maxConcurrency) - revChan := make(chan commitInfo, maxConcurrency) - doneChan := make(chan error) - - // Receive loop will exit when it collects same number of data pieces as tree entries. - // It notifies doneChan before exits or notify early with possible error. - infoMap := make(map[string][]interface{}, len(tes)) - go func() { - i := 0 - for info := range revChan { - if info.err != nil { - doneChan <- info.err - return - } - - infoMap[info.entryName] = info.infos - i++ - if i == len(tes) { - break - } - } - doneChan <- nil - }() - - for i := range tes { - // When taskChan is idle (or has empty slots), put operation will not block. - // However when taskChan is full, code will block and wait any running goroutines to finish. - taskChan <- true - - if tes[i].Type != ObjectCommit { - go func(i int) { - cinfo := commitInfo{entryName: tes[i].Name()} - c, err := commit.GetCommitByPath(filepath.Join(treePath, tes[i].Name())) - if err != nil { - cinfo.err = fmt.Errorf("GetCommitByPath (%s/%s): %v", treePath, tes[i].Name(), err) - } else { - cinfo.infos = []interface{}{tes[i], c} - } - revChan <- cinfo - <-taskChan // Clear one slot from taskChan to allow new goroutines to start. - }(i) - continue - } - - // Handle submodule - go func(i int) { - cinfo := commitInfo{entryName: tes[i].Name()} - sm, err := commit.GetSubModule(path.Join(treePath, tes[i].Name())) - if err != nil && !IsErrNotExist(err) { - cinfo.err = fmt.Errorf("GetSubModule (%s/%s): %v", treePath, tes[i].Name(), err) - revChan <- cinfo - return - } - - smURL := "" - if sm != nil { - smURL = sm.URL - } - - c, err := commit.GetCommitByPath(filepath.Join(treePath, tes[i].Name())) - if err != nil { - cinfo.err = fmt.Errorf("GetCommitByPath (%s/%s): %v", treePath, tes[i].Name(), err) - } else { - cinfo.infos = []interface{}{tes[i], NewSubModuleFile(c, smURL, tes[i].ID.String())} - } - revChan <- cinfo - <-taskChan - }(i) - } - - if err := <-doneChan; err != nil { - return nil, err - } - - commitsInfo := make([][]interface{}, len(tes)) - for i := 0; i < len(tes); i++ { - commitsInfo[i] = infoMap[tes[i].Name()] - } - return commitsInfo, nil -} diff --git a/tree_entry_test.go b/tree_entry_test.go index a107440ab..9a79d8e68 100644 --- a/tree_entry_test.go +++ b/tree_entry_test.go @@ -5,65 +5,11 @@ package git import ( - "os" - "path/filepath" "testing" - "time" "github.com/stretchr/testify/assert" ) -const benchmarkReposDir = "benchmark/repos/" - -func setupGitRepo(url string, name string) (string, error) { - repoDir := filepath.Join(benchmarkReposDir, name) - if _, err := os.Stat(repoDir); err == nil { - return repoDir, nil - } - return repoDir, Clone(url, repoDir, CloneRepoOptions{ - Mirror: false, - Bare: false, - Quiet: true, - Timeout: 5 * time.Minute, - }) -} - -func BenchmarkEntries_GetCommitsInfo(b *testing.B) { - benchmarks := []struct { - url string - name string - }{ - {url: "https://github.com/go-gitea/gitea.git", name: "gitea"}, - {url: "https://github.com/ethantkoenig/manyfiles.git", name: "manyfiles"}, - {url: "https://github.com/moby/moby.git", name: "moby"}, - {url: "https://github.com/golang/go.git", name: "go"}, - {url: "https://github.com/torvalds/linux.git", name: "linux"}, - } - for _, benchmark := range benchmarks { - var commit *Commit - var entries Entries - if repoPath, err := setupGitRepo(benchmark.url, benchmark.name); err != nil { - b.Fatal(err) - } else if repo, err := OpenRepository(repoPath); err != nil { - b.Fatal(err) - } else if commit, err = repo.GetBranchCommit("master"); err != nil { - b.Fatal(err) - } else if entries, err = commit.Tree.ListEntries(); err != nil { - b.Fatal(err) - } - entries.Sort() - b.ResetTimer() - b.Run(benchmark.name, func(b *testing.B) { - for i := 0; i < b.N; i++ { - _, err := entries.GetCommitsInfo(commit, "") - if err != nil { - b.Fatal(err) - } - } - }) - } -} - func getTestEntries() Entries { return Entries{ &TreeEntry{name: "v1.0", mode: EntryModeTree}, From e39b29f3921453f907e6a78dc0f67acc8411dfaf Mon Sep 17 00:00:00 2001 From: Ethan Koenig Date: Wed, 22 Nov 2017 23:13:03 -0800 Subject: [PATCH 2/4] Fix copy/rename bug --- commit_info.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/commit_info.go b/commit_info.go index c01e22ef6..96196c5b0 100644 --- a/commit_info.go +++ b/commit_info.go @@ -248,7 +248,9 @@ func (state *getCommitsInfoState) processGitLogOutput(scanner *bufio.Scanner) er continue } if line[0] >= 'A' && line[0] <= 'X' { // a file was changed by the current commit - tabIndex := strings.IndexByte(line, '\t') + // look for the last tab, since for copies (C) and renames (R) two + // filenames are printed: src, then dest + tabIndex := strings.LastIndexByte(line, '\t') if tabIndex < 1 { return fmt.Errorf("misformatted line: %s", line) } From 2706f5e7f6ef53c1e75cd454672e79540e42a869 Mon Sep 17 00:00:00 2001 From: Ethan Koenig Date: Wed, 22 Nov 2017 23:19:01 -0800 Subject: [PATCH 3/4] Comment --- commit_info.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/commit_info.go b/commit_info.go index 96196c5b0..b8b0683e5 100644 --- a/commit_info.go +++ b/commit_info.go @@ -52,6 +52,8 @@ func (state *getCommitsInfoState) numRemainingEntries() int { return numRemaining } +// getTargetEntryPath Returns the next path for a targeted-searching thread to +// search for, or returns the empty string if nothing left to search for func (state *getCommitsInfoState) getTargetedEntryPath() string { var targetedEntryPath string state.lock.Lock() From acb3e0f922b8319954e39742ae02a34d1ab50821 Mon Sep 17 00:00:00 2001 From: Ethan Koenig Date: Sat, 9 Dec 2017 17:01:53 -0800 Subject: [PATCH 4/4] Use defer --- commit_info.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/commit_info.go b/commit_info.go index b8b0683e5..77fe53bdd 100644 --- a/commit_info.go +++ b/commit_info.go @@ -47,9 +47,8 @@ type getCommitsInfoState struct { func (state *getCommitsInfoState) numRemainingEntries() int { state.lock.Lock() - numRemaining := len(state.entries) - len(state.commits) - state.lock.Unlock() - return numRemaining + defer state.lock.Unlock() + return len(state.entries) - len(state.commits) } // getTargetEntryPath Returns the next path for a targeted-searching thread to @@ -57,6 +56,7 @@ func (state *getCommitsInfoState) numRemainingEntries() int { func (state *getCommitsInfoState) getTargetedEntryPath() string { var targetedEntryPath string state.lock.Lock() + defer state.lock.Unlock() for _, entry := range state.entries { entryPath := path.Join(state.treePath, entry.Name()) if _, ok := state.commits[entryPath]; ok { @@ -68,7 +68,6 @@ func (state *getCommitsInfoState) getTargetedEntryPath() string { state.targetedPaths[entryPath] = struct{}{} break } - state.lock.Unlock() return targetedEntryPath } @@ -179,11 +178,11 @@ func (state *getCommitsInfoState) update(entryPath string, commit *Commit) bool var updated bool state.lock.Lock() + defer state.lock.Unlock() if _, ok := state.commits[entryPath]; !ok { state.commits[entryPath] = commit updated = true } - state.lock.Unlock() return updated }