Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Proof of concept: Experimental support for git commit graph files #6701

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions models/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -2240,6 +2240,14 @@ func GitFsck() {
func(idx int, bean interface{}) error {
repo := bean.(*Repository)
repoPath := repo.RepoPath()
// TODO: Move this elsewhere
if gitRepo, err := git.OpenRepository(repoPath); err == nil {
log.Trace("Building commit graph index")
if err := gitRepo.BuildCommitGraph(false); err != nil {
desc := fmt.Sprintf("Failed to build commit graph (%s): %v", repoPath, err)
log.Warn(desc)
}
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is inserted here only to allow people to play with the feature.

Running a health check for all the repositories will also rebuild the commit graph files (http://gitea/admin?op=9). It is entirely possible to generate the commit graph file using the command line git commit-graph write tool instead. The bloom filter experiment is enabled by changing BuildCommitGraph(false) to BuildCommitGraph(true) in the above code. It will significantly increase the size of the commit graph files and the time to build them, but in many cases it will also significantly speed up history queries on large repositories (unless I broke it :D).

log.Trace("Running health check on repository %s", repoPath)
if err := git.Fsck(repoPath, setting.Cron.RepoHealthCheck.Timeout, setting.Cron.RepoHealthCheck.Args...); err != nil {
desc := fmt.Sprintf("Failed to health check repository (%s): %v", repoPath, err)
Expand Down
92 changes: 92 additions & 0 deletions modules/commitgraph/plumbing/format/commitgraph/bloom.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
package commitgraph

import (
"encoding/binary"
"hash"
"hash/fnv"

"github.com/dchest/siphash"
)

// filter derives k bucket positions in [0, m) for a byte string by double
// hashing over a single 64-bit FNV digest.
type filter struct {
	m uint32 // number of buckets
	k uint32 // probe positions per key
	h hash.Hash64
}

// bits returns the k bucket positions for data. The 64-bit digest is split
// into two 32-bit halves (lo, hi) and position i is (lo + hi*i) mod m —
// the classic two-hash construction for bloom filters.
func (f *filter) bits(data []byte) []uint32 {
	f.h.Reset()
	f.h.Write(data)
	digest := f.h.Sum(nil)
	lo := binary.BigEndian.Uint32(digest[4:8])
	hi := binary.BigEndian.Uint32(digest[0:4])
	positions := make([]uint32, 0, f.k)
	for i := uint32(0); i < f.k; i++ {
		positions = append(positions, (lo+hi*i)%f.m)
	}
	return positions
}

// newFilter returns a filter with m buckets and k probes backed by FNV-64.
func newFilter(m, k uint32) *filter {
	return &filter{m: m, k: k, h: fnv.New64()}
}

// Geometry of the per-commit path bloom filter: sized for roughly n=512
// paths at m=10 bits per path (5120 bits = 640 bytes), probed at k=7
// positions per path. Keeping these in one place prevents Test and Add
// from drifting apart.
const (
	bloomFilterBits   = 5120                // total bits in the filter (n*m)
	bloomFilterBytes  = bloomFilterBits / 8 // backing array length (640)
	bloomFilterHashes = 7                   // probe positions (k) per path
)

// BloomPathFilter is a probabilistic data structure that helps determining
// whether a path was changed.
//
// The implementation uses a standard bloom filter with n=512, m=10, k=7
// parameters using the 64-bit SipHash hash function with zero key.
type BloomPathFilter struct {
	b []byte // bit array, bloomFilterBytes long
}

// Test checks whether a path was previously added to the filter. Returns
// false if the path is definitely not present in the filter. Returns true
// if the path could be present in the filter (false positives possible).
func (f *BloomPathFilter) Test(path string) bool {
	// Split one 64-bit SipHash digest into two 32-bit halves and derive
	// the probe positions by double hashing: (a + b*i) mod bits.
	d := siphash.Hash(0, 0, []byte(path))
	a := uint32(d)
	b := uint32(d >> 32)
	var i uint32
	for i = 0; i < bloomFilterHashes; i++ {
		bit := (a + b*i) % bloomFilterBits
		if f.b[bit>>3]&(1<<(bit&7)) == 0 {
			return false
		}
	}
	return true
}

// Add records path in the filter so that later Test(path) calls return true.
func (f *BloomPathFilter) Add(path string) {
	// Same probe-position derivation as Test.
	d := siphash.Hash(0, 0, []byte(path))
	a := uint32(d)
	b := uint32(d >> 32)
	var i uint32
	for i = 0; i < bloomFilterHashes; i++ {
		bit := (a + b*i) % bloomFilterBits
		f.b[bit>>3] |= 1 << (bit & 7)
	}
}

// Data returns the raw bit-array bytes. The slice is shared with the
// filter, not copied.
func (f *BloomPathFilter) Data() []byte {
	return f.b
}

// NewBloomPathFilter creates a new empty bloom filter.
func NewBloomPathFilter() *BloomPathFilter {
	return &BloomPathFilter{make([]byte, bloomFilterBytes)}
}

// LoadBloomPathFilter creates a bloom filter from a byte array previously
// returned by Data. The slice is used as-is, not copied.
// NOTE(review): data shorter than bloomFilterBytes will make Test/Add
// panic on an out-of-range bit — confirm callers always pass 640 bytes.
func LoadBloomPathFilter(data []byte) *BloomPathFilter {
	return &BloomPathFilter{data}
}
38 changes: 38 additions & 0 deletions modules/commitgraph/plumbing/format/commitgraph/commitgraph.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package commitgraph

import (
"time"

"gopkg.in/src-d/go-git.v4/plumbing"
)

// Node is a reduced representation of Commit as presented in the commit graph
// file. It is merely useful as an optimization for walking the commit graphs.
type Node struct {
	// TreeHash is the hash of the root tree of the commit.
	TreeHash plumbing.Hash
	// ParentIndexes are the indexes of the parent commits of the commit.
	ParentIndexes []int
	// ParentHashes are the hashes of the parent commits of the commit.
	ParentHashes []plumbing.Hash
	// Generation is the pre-computed generation number stored in the commit
	// graph, or zero if not available.
	Generation int
	// When is the timestamp of the commit.
	When time.Time
}

// Index represents a commit graph and allows indexed access to its nodes
// using the commit object hash.
type Index interface {
	// GetIndexByHash gets the index in the commit graph from the commit
	// hash, if available.
	GetIndexByHash(h plumbing.Hash) (int, error)
	// GetNodeByIndex gets the commit node from the commit graph using an
	// index obtained from a child node, if available.
	GetNodeByIndex(i int) (*Node, error)
	// Hashes returns all the hashes that are available in the index.
	Hashes() []plumbing.Hash

	// GetBloomFilterByIndex gets the bloom filter for files changed in the
	// commit, if available (experimental extension).
	GetBloomFilterByIndex(i int) (*BloomPathFilter, error)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package commitgraph_test

import (
"testing"

"code.gitea.io/gitea/modules/commitgraph/plumbing/format/commitgraph"
"golang.org/x/exp/mmap"

. "gopkg.in/check.v1"
"gopkg.in/src-d/go-git-fixtures.v3"
"gopkg.in/src-d/go-git.v4/plumbing"
)

// Test wires gocheck's suite runner into the standard "go test" harness.
func Test(t *testing.T) { TestingT(t) }

// CommitgraphSuite is the gocheck test suite; embedding fixtures.Suite
// provides the shared go-git fixture setup and teardown.
type CommitgraphSuite struct {
	fixtures.Suite
}

// Register the suite with gocheck.
var _ = Suite(&CommitgraphSuite{})

// TestDecode mmaps a pre-built commit-graph fixture and verifies that a
// known root commit can be looked up and reports no parents.
func (s *CommitgraphSuite) TestDecode(c *C) {
	// Use forward slashes: Go resolves them on every platform, whereas
	// the original backslash-escaped path only worked on Windows.
	reader, err := mmap.Open("../../tests/testgit/objects/info/commit-graph")
	c.Assert(err, IsNil)
	// Close even if a later assertion aborts the test.
	defer reader.Close()

	index, err := commitgraph.OpenFileIndex(reader)
	c.Assert(err, IsNil)

	nodeIndex, err := index.GetIndexByHash(plumbing.NewHash("5aa811d3c2f6d5d6e928a4acacd15248928c26d0"))
	c.Assert(err, IsNil)
	node, err := index.GetNodeByIndex(nodeIndex)
	c.Assert(err, IsNil)
	c.Assert(len(node.ParentIndexes), Equals, 0)
}
197 changes: 197 additions & 0 deletions modules/commitgraph/plumbing/format/commitgraph/encoder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
package commitgraph

import (
"bytes"
"crypto/sha1"
"hash"
"io"
"math"

"gopkg.in/src-d/go-git.v4/plumbing"
"gopkg.in/src-d/go-git.v4/utils/binary"
)

// Encoder writes MemoryIndex structs to an output stream.
type Encoder struct {
io.Writer
hash hash.Hash
}

// NewEncoder returns a new stream encoder that writes to w.
func NewEncoder(w io.Writer) *Encoder {
h := sha1.New()
mw := io.MultiWriter(w, h)
return &Encoder{mw, h}
}

// Encode writes idx to the underlying stream in the commit-graph file
// layout: header, chunk table, OID fanout, OID lookup, commit data,
// optional experimental bloom chunks, optional large-edge list, and a
// trailing SHA-1 checksum of everything written.
func (e *Encoder) Encode(idx Index) error {
	// Get all the hashes in the memory index
	hashes := idx.Hashes()

	// Sort the hashes and build our index
	plumbing.HashesSort(hashes)
	hashToIndex := make(map[plumbing.Hash]uint32)
	hashFirstToCount := make(map[byte]uint32)
	for i, hash := range hashes {
		hashToIndex[hash] = uint32(i)
		hashFirstToCount[hash[0]]++
	}

	// Find out if we will need large edge table
	// (commits with more than two parents spill the extras there).
	chunkCount := 3
	hasLargeEdges := false
	for i := 0; i < len(hashes); i++ {
		v, _ := idx.GetNodeByIndex(i)
		if len(v.ParentHashes) > 2 {
			hasLargeEdges = true
			chunkCount++
			break
		}
	}

	// Find out if the bloom filters are present
	hasBloomFilters := false
	sparseBloomFilters := false
	bloomFiltersCount := 0
	for i := 0; i < len(hashes); i++ {
		_, err := idx.GetBloomFilterByIndex(i)
		if err == nil {
			bloomFiltersCount++
		}
	}
	if bloomFiltersCount > 0 {
		hasBloomFilters = true
		chunkCount++
		// NOTE(review): bloomFiltersCount can never exceed len(hashes),
		// so this threshold (4/3 of len) is satisfied for any len >= 3 and
		// sparse mode becomes effectively unconditional. The intent was
		// presumably len(hashes)*3/4 ("sparse when fewer than 75% of
		// commits have a filter") — confirm before relying on this.
		if bloomFiltersCount < (len(hashes) * 4 / 3) {
			sparseBloomFilters = true
			chunkCount++
		}
	}

	// Precomputed chunk offsets: 20-byte header region plus 12 bytes per
	// chunk-table entry, then each chunk's fixed-size records.
	var fanoutOffset = uint64(20 + (chunkCount * 12))
	var oidLookupOffset = fanoutOffset + 4*256
	var commitDataOffset = oidLookupOffset + uint64(len(hashes))*20
	var bloomOffset = commitDataOffset + uint64(len(hashes))*36
	var sparseBloomOffset = bloomOffset + uint64(bloomFiltersCount)*640
	var largeEdgeListOffset uint64
	var largeEdges []uint32

	// Write header
	// TODO: Error handling
	// (most writes below ignore errors; the MultiWriter hash still sees
	// whatever was attempted, so a failed write corrupts the checksum too)
	e.Write(commitFileSignature)
	// version 1, hash version 1, chunk count, zero padding byte.
	e.Write([]byte{1, 1, byte(chunkCount), 0})

	// Write chunk headers
	e.Write(oidFanoutSignature)
	binary.WriteUint64(e, fanoutOffset)
	e.Write(oidLookupSignature)
	binary.WriteUint64(e, oidLookupOffset)
	e.Write(commitDataSignature)
	binary.WriteUint64(e, commitDataOffset)
	if hasBloomFilters {
		e.Write(experimentalBloomSignature)
		binary.WriteUint64(e, bloomOffset)
		if sparseBloomFilters {
			e.Write(experimentalSparseBloomSignature)
			binary.WriteUint64(e, sparseBloomOffset)
			// Sparse bitmap is one bit per commit, rounded up to bytes.
			largeEdgeListOffset = sparseBloomOffset + uint64(len(hashes)+7)/8
		} else {
			// Dense mode stores one 640-byte filter per commit.
			largeEdgeListOffset = bloomOffset + 640*uint64(len(hashes))
		}
	}
	if hasLargeEdges {
		e.Write(largeEdgeListSignature)
		binary.WriteUint64(e, largeEdgeListOffset)
	}
	// Chunk-table terminator: zero signature and zero offset.
	e.Write([]byte{0, 0, 0, 0})
	binary.WriteUint64(e, uint64(0))

	// Write fanout: entry i holds the cumulative number of OIDs whose
	// first byte is <= i.
	var cumulative uint32
	for i := 0; i <= 0xff; i++ {
		if err := binary.WriteUint32(e, hashFirstToCount[byte(i)]+cumulative); err != nil {
			return err
		}
		cumulative += hashFirstToCount[byte(i)]
	}

	// Write OID lookup (the sorted 20-byte hashes).
	for _, hash := range hashes {
		if _, err := e.Write(hash[:]); err != nil {
			return err
		}
	}

	// Write commit data: per commit, the root tree hash, two parent slots,
	// and a packed generation/timestamp word (36 bytes total).
	for _, hash := range hashes {
		origIndex, _ := idx.GetIndexByHash(hash)
		commitData, _ := idx.GetNodeByIndex(origIndex)
		if _, err := e.Write(commitData.TreeHash[:]); err != nil {
			return err
		}

		if len(commitData.ParentHashes) == 0 {
			binary.WriteUint32(e, parentNone)
			binary.WriteUint32(e, parentNone)
		} else if len(commitData.ParentHashes) == 1 {
			binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[0]])
			binary.WriteUint32(e, parentNone)
		} else if len(commitData.ParentHashes) == 2 {
			binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[0]])
			binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[1]])
		} else if len(commitData.ParentHashes) > 2 {
			// Octopus merge: second slot is a masked index into the large
			// edge list; the run there is terminated with parentLast.
			binary.WriteUint32(e, hashToIndex[commitData.ParentHashes[0]])
			binary.WriteUint32(e, uint32(len(largeEdges))|parentOctopusMask)
			for _, parentHash := range commitData.ParentHashes[1:] {
				largeEdges = append(largeEdges, hashToIndex[parentHash])
			}
			largeEdges[len(largeEdges)-1] |= parentLast
		}

		// Pack commit time into the low 34 bits and the generation number
		// above bit 34, mirroring git's commit-data chunk layout.
		unixTime := uint64(commitData.When.Unix())
		unixTime |= uint64(commitData.Generation) << 34
		binary.WriteUint64(e, unixTime)
	}

	// Write bloom filters (experimental)
	if hasBloomFilters {
		var sparseBloomBitset []byte

		if sparseBloomFilters {
			// Start with all bits set; clear the bit of every commit that
			// has no filter.
			sparseBloomBitset = bytes.Repeat([]byte{0xff}, (len(hashes)+7)/8)
		}

		for i, hash := range hashes {
			origIndex, _ := idx.GetIndexByHash(hash)
			if bloomFilter, err := idx.GetBloomFilterByIndex(origIndex); err != nil {
				// Missing filter: dense mode writes an all-ones 640-byte
				// placeholder (80 x 8 bytes, matches everything); sparse
				// mode just clears the commit's presence bit.
				if !sparseBloomFilters {
					for i := 0; i < 80; i++ {
						binary.WriteUint64(e, math.MaxUint64)
					}
				} else {
					sparseBloomBitset[i/8] &= ^(1 << uint(i%8))
				}
			} else {
				e.Write(bloomFilter.Data())
			}
		}

		if sparseBloomFilters {
			e.Write(sparseBloomBitset)
		}
	}

	// Write large edges if necessary
	if hasLargeEdges {
		for _, parent := range largeEdges {
			binary.WriteUint32(e, parent)
		}
	}

	// Write checksum: SHA-1 of everything written so far (the MultiWriter
	// in NewEncoder has been hashing every byte as it went out).
	if _, err := e.Write(e.hash.Sum(nil)[:20]); err != nil {
		return err
	}

	return nil
}
Loading