Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor code indexer #9313

Merged
merged 10 commits into from
Dec 23, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
529 changes: 247 additions & 282 deletions modules/indexer/code/bleve.go

Large diffs are not rendered by default.

54 changes: 54 additions & 0 deletions modules/indexer/code/bleve_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,66 @@
package code

import (
"os"
"path/filepath"
"testing"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"

"github.com/stretchr/testify/assert"
)

func TestMain(m *testing.M) {
models.MainTest(m, filepath.Join("..", "..", ".."))
}

func TestIndexAndSearch(t *testing.T) {
models.PrepareTestEnv(t)

dir := "./bleve.index"
os.RemoveAll(dir)

setting.Indexer.RepoIndexerEnabled = true
idx, _, err := NewBleveIndexer(dir)
if err != nil {
idx.Close()
log.Fatal("indexer.Init: %v", err)
}

err = idx.Index(1)
assert.NoError(t, err)

var (
keywords = []struct {
Keyword string
IDs []int64
}{
{
Keyword: "Description",
IDs: []int64{1},
},
{
Keyword: "repo1",
IDs: []int64{1},
},
{
Keyword: "non-exist",
IDs: []int64{},
},
}
)

for _, kw := range keywords {
total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
assert.NoError(t, err)
assert.EqualValues(t, len(kw.IDs), total)

var ids = make([]int64, 0, len(res))
for _, hit := range res {
ids = append(ids, hit.RepoID)
}
assert.EqualValues(t, kw.IDs, ids)
}
}
147 changes: 147 additions & 0 deletions modules/indexer/code/git.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package code

import (
"strconv"
"strings"

"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)

type fileUpdate struct {
Filename string
BlobSha string
}

// repoChanges changes (file additions/updates/removals) to a repo
type repoChanges struct {
Updates []fileUpdate
RemovedFilenames []string
}

func getDefaultBranchSha(repo *models.Repository) (string, error) {
stdout, err := git.NewCommand("show-ref", "-s", git.BranchPrefix+repo.DefaultBranch).RunInDir(repo.RepoPath())
if err != nil {
return "", err
}
return strings.TrimSpace(stdout), nil
}

// getRepoChanges returns changes to repo since last indexer update
func getRepoChanges(repo *models.Repository, revision string) (*repoChanges, error) {
if err := repo.GetIndexerStatus(); err != nil {
return nil, err
}

if len(repo.IndexerStatus.CommitSha) == 0 {
return genesisChanges(repo, revision)
}
return nonGenesisChanges(repo, revision)
}

func isIndexable(entry *git.TreeEntry) bool {
if !entry.IsRegular() && !entry.IsExecutable() {
return false
}
name := strings.ToLower(entry.Name())
for _, g := range setting.Indexer.ExcludePatterns {
if g.Match(name) {
return false
}
}
for _, g := range setting.Indexer.IncludePatterns {
if g.Match(name) {
return true
}
}
return len(setting.Indexer.IncludePatterns) == 0
}

// parseGitLsTreeOutput parses the output of a `git ls-tree -r --full-name` command
func parseGitLsTreeOutput(stdout []byte) ([]fileUpdate, error) {
entries, err := git.ParseTreeEntries(stdout)
if err != nil {
return nil, err
}
var idxCount = 0
updates := make([]fileUpdate, len(entries))
for _, entry := range entries {
if isIndexable(entry) {
updates[idxCount] = fileUpdate{
Filename: entry.Name(),
BlobSha: entry.ID.String(),
}
idxCount++
}
}
return updates[:idxCount], nil
}

// genesisChanges get changes to add repo to the indexer for the first time
func genesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
var changes repoChanges
stdout, err := git.NewCommand("ls-tree", "--full-tree", "-r", revision).
RunInDirBytes(repo.RepoPath())
if err != nil {
return nil, err
}
changes.Updates, err = parseGitLsTreeOutput(stdout)
return &changes, err
}

// nonGenesisChanges get changes since the previous indexer update
func nonGenesisChanges(repo *models.Repository, revision string) (*repoChanges, error) {
diffCmd := git.NewCommand("diff", "--name-status",
repo.IndexerStatus.CommitSha, revision)
stdout, err := diffCmd.RunInDir(repo.RepoPath())
if err != nil {
// previous commit sha may have been removed by a force push, so
// try rebuilding from scratch
log.Warn("git diff: %v", err)
if err = indexer.Delete(repo.ID); err != nil {
return nil, err
}
return genesisChanges(repo, revision)
}
var changes repoChanges
updatedFilenames := make([]string, 0, 10)
for _, line := range strings.Split(stdout, "\n") {
line = strings.TrimSpace(line)
if len(line) == 0 {
continue
}
filename := strings.TrimSpace(line[1:])
if len(filename) == 0 {
continue
} else if filename[0] == '"' {
filename, err = strconv.Unquote(filename)
if err != nil {
return nil, err
}
}

switch status := line[0]; status {
case 'M', 'A':
updatedFilenames = append(updatedFilenames, filename)
case 'D':
changes.RemovedFilenames = append(changes.RemovedFilenames, filename)
default:
log.Warn("Unrecognized status: %c (line=%s)", status, line)
}
}

cmd := git.NewCommand("ls-tree", "--full-tree", revision, "--")
cmd.AddArguments(updatedFilenames...)
lsTreeStdout, err := cmd.RunInDirBytes(repo.RepoPath())
if err != nil {
return nil, err
}
changes.Updates, err = parseGitLsTreeOutput(lsTreeStdout)
return &changes, err
}
107 changes: 54 additions & 53 deletions modules/indexer/code/indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,72 +5,73 @@
package code

import (
"os"
"strconv"
"time"

"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
)

"github.com/blevesearch/bleve"
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/index/upsidedown"
"github.com/blevesearch/bleve/mapping"
"github.com/blevesearch/bleve/search/query"
"github.com/ethantkoenig/rupture"
var (
indexer Indexer
)

// indexerID a bleve-compatible unique identifier for an integer id
func indexerID(id int64) string {
return strconv.FormatInt(id, 36)
// SearchResult result of performing a search in a repo
type SearchResult struct {
RepoID int64
StartIndex int
EndIndex int
Filename string
Content string
}

// numericEqualityQuery a numeric equality query for the given value and field
func numericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
f := float64(value)
tru := true
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
q.SetField(field)
return q
// Indexer defines an interface to indexer issues contents
type Indexer interface {
Index(repoID int64) error
Delete(repoID int64) error
Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
Close()
}

const unicodeNormalizeName = "unicodeNormalize"
// Init initialize the repo indexer
func Init() {
if !setting.Indexer.RepoIndexerEnabled {
return
}

func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]interface{}{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
})
}
waitChannel := make(chan time.Duration)
go func() {
start := time.Now()
log.Info("Initializing Repository Indexer")
var created bool
var err error
indexer, created, err = NewBleveIndexer(setting.Indexer.RepoPath)
if err != nil {
indexer.Close()
log.Fatal("indexer.Init: %v", err)
}

const maxBatchSize = 16
go processRepoIndexerOperationQueue(indexer)

// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
func openIndexer(path string, latestVersion int) (bleve.Index, error) {
_, err := os.Stat(setting.Indexer.IssuePath)
if err != nil && os.IsNotExist(err) {
return nil, nil
} else if err != nil {
return nil, err
}
if created {
go populateRepoIndexer()
}

metadata, err := rupture.ReadIndexMetadata(path)
if err != nil {
return nil, err
}
if metadata.Version < latestVersion {
// the indexer is using a previous version, so we should delete it and
// re-populate
return nil, os.RemoveAll(path)
}
waitChannel <- time.Since(start)
}()

index, err := bleve.Open(path)
if err != nil && err == upsidedown.IncompatibleVersion {
// the indexer was built with a previous version of bleve, so we should
// delete it and re-populate
return nil, os.RemoveAll(path)
} else if err != nil {
return nil, err
if setting.Indexer.StartupTimeout > 0 {
go func() {
timeout := setting.Indexer.StartupTimeout
if graceful.GetManager().IsChild() && setting.GracefulHammerTime > 0 {
timeout += setting.GracefulHammerTime
}
select {
case duration := <-waitChannel:
log.Info("Repository Indexer Initialization took %v", duration)
case <-time.After(timeout):
log.Fatal("Repository Indexer Initialization Timed-Out after: %v", timeout)
}
}()
}
return index, nil
}
Loading