Skip to content

Commit

Permalink
Merge pull request #8103 from dolthub/nicktobey/json-remove
Browse files Browse the repository at this point in the history
Optimize JSON_REMOVE on `IndexedJsonDocument`
  • Loading branch information
nicktobey committed Jul 8, 2024
2 parents a43c7b4 + 716d65d commit 094ba06
Show file tree
Hide file tree
Showing 5 changed files with 179 additions and 26 deletions.
2 changes: 1 addition & 1 deletion go/store/prolly/tree/json_chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func (j *JsonChunker) Done(ctx context.Context) (Node, error) {
// When inserting into the beginning of an object or array, we need to add an extra comma.
// We could track then in the chunker, but it's easier to just check the next part of JSON to determine
// whether we need the comma.
if jsonBytes[0] != '}' && jsonBytes[0] != ']' && jsonBytes[0] != ',' {
if j.jScanner.currentPath.getScannerState() == endOfValue && jsonBytes[0] != '}' && jsonBytes[0] != ']' && jsonBytes[0] != ',' {
j.appendJsonToBuffer([]byte(","))
}
// Append the rest of the JsonCursor, then continue until we either exhaust the cursor, or we coincide with a boundary from the original tree.
Expand Down
40 changes: 27 additions & 13 deletions go/store/prolly/tree/json_cursor.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,25 +49,28 @@ func getPreviousKey(ctx context.Context, cur *cursor) ([]byte, error) {
}

// newJsonCursor takes the root node of a prolly tree representing a JSON document, and creates a new JsonCursor for reading
// JSON, starting at the specified location in the document.
func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLocation) (*JsonCursor, error) {
// JSON, starting at the specified location in the document. Returns a boolean indicating whether the location already exists
// in the document. If the location does not exist in the document, the resulting JsonCursor
// will be at the location where the value would be if it was inserted.
func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLocation, forRemoval bool) (jCur *JsonCursor, found bool, err error) {
cur, err := newCursorAtKey(ctx, ns, root, startKey.key, jsonLocationOrdering{})
if err != nil {
return nil, err
return nil, false, err
}
previousKey, err := getPreviousKey(ctx, cur)
if err != nil {
return nil, err
return nil, false, err
}
jsonBytes := cur.currentValue()
jsonDecoder := ScanJsonFromMiddleWithKey(jsonBytes, previousKey)

jcur := JsonCursor{cur: cur, jsonScanner: jsonDecoder}
err = jcur.AdvanceToLocation(ctx, startKey)

found, err = jcur.AdvanceToLocation(ctx, startKey, forRemoval)
if err != nil {
return nil, err
return nil, found, err
}
return &jcur, nil
return &jcur, found, nil
}

func (j JsonCursor) Valid() bool {
Expand Down Expand Up @@ -122,20 +125,26 @@ func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool {
return compareJsonLocations(path, nodeEndPosition) <= 0
}

func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation) error {
// AdvanceToLocation causes the cursor to advance to the specified position. This function returns a boolean indicating
// whether the position already exists. If it doesn't, the cursor stops at the location where the value would be if it
// were inserted.
// The `forRemoval` parameter changes the behavior when advancing to the start of an object key. When this parameter is true,
// the cursor advances to the end of the previous value, prior to the object key. This allows the key to be removed along
// with the value.
func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, forRemoval bool) (found bool, err error) {
if !j.isKeyInChunk(path) {
// Our destination is in another chunk, load it.
err := Seek(ctx, j.cur.parent, path.key, jsonLocationOrdering{})
if err != nil {
return err
return false, err
}
j.cur.nd, err = fetchChild(ctx, j.cur.nrw, j.cur.parent.currentRef())
if err != nil {
return err
return false, err
}
previousKey, err := getPreviousKey(ctx, j.cur)
if err != nil {
return err
return false, err
}
j.jsonScanner = ScanJsonFromMiddleWithKey(j.cur.currentValue(), previousKey)
}
Expand All @@ -150,16 +159,21 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation) e
// there is no path greater than the end-of-document path, which is always the last key.
panic("Reached the end of the JSON document while advancing. This should not be possible. Is the document corrupt?")
} else if err != nil {
return err
return false, err
}
cmp = compareJsonLocations(j.jsonScanner.currentPath, path)
}
// If the supplied path doesn't exist in the document, then we want to stop the cursor at the start of the point
// were it would appear. This may mean that we've gone too far and need to rewind one location.
if cmp > 0 {
j.jsonScanner = previousScanner
return false, nil
}
// If the element is going to be removed, rewind the scanner one location, to before the key of the removed object.
if forRemoval {
j.jsonScanner = previousScanner
}
return nil
return true, nil
}

func (j *JsonCursor) AdvanceToNextLocation(ctx context.Context) (crossedBoundary bool, err error) {
Expand Down
88 changes: 76 additions & 12 deletions go/store/prolly/tree/json_indexed_document.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,16 @@ func (i IndexedJsonDocument) tryLookup(ctx context.Context, pathString string) (
return nil, err
}

jCur, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, path)
return i.lookupByLocation(ctx, path)
}

func (i IndexedJsonDocument) lookupByLocation(ctx context.Context, path jsonLocation) (sql.JSONWrapper, error) {
jCur, found, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, path, false)
if err != nil {
return nil, err
}

cursorPath := jCur.GetCurrentPath()
cmp := compareJsonLocations(cursorPath, path)
if cmp != 0 {
if !found {
// The key doesn't exist in the document.
return nil, nil
}
Expand Down Expand Up @@ -191,18 +193,22 @@ func (i IndexedJsonDocument) tryInsert(ctx context.Context, path string, val sql
return nil, false, err
}

jsonCursor, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, keyPath)
jsonCursor, found, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, keyPath, false)
if err != nil {
return nil, false, err
}

cursorPath := jsonCursor.GetCurrentPath()
cmp := compareJsonLocations(cursorPath, keyPath)
if cmp == 0 {
if found {
// The key already exists in the document.
return i, false, nil
}

return i.insertIntoCursor(ctx, keyPath, jsonCursor, val)
}

func (i IndexedJsonDocument) insertIntoCursor(ctx context.Context, keyPath jsonLocation, jsonCursor *JsonCursor, val sql.JSONWrapper) (types.MutableJSON, bool, error) {
cursorPath := jsonCursor.GetCurrentPath()

// If the inserted path is equivalent to "$" (which also includes "$[0]" on non-arrays), do nothing.
if cursorPath.size() == 0 && cursorPath.getScannerState() == startOfValue {
return i, false, nil
Expand Down Expand Up @@ -272,6 +278,7 @@ func (i IndexedJsonDocument) tryInsert(ctx context.Context, path string, val sql
// For example, attempting to insert into the path "$.a.b" in the document {"a": 1}
// We can detect this by checking to see if the insertion point in the original document comes before the inserted path.
// (For example, the insertion point occurs at $.a.START, which is before $.a.b)
cmp := compareJsonLocations(cursorPath, keyPath)
if cmp < 0 && cursorPath.getScannerState() == startOfValue {
// We just attempted to insert into a scalar.
return i, false, nil
Expand Down Expand Up @@ -313,13 +320,70 @@ func (i IndexedJsonDocument) tryInsert(ctx context.Context, path string, val sql
return NewIndexedJsonDocument(ctx, newRoot, i.m.NodeStore), true, nil
}

// Remove is not yet implemented, so we call it on a types.JSONDocument instead.
func (i IndexedJsonDocument) Remove(ctx context.Context, path string) (types.MutableJSON, bool, error) {
v, err := i.ToInterface()
// Remove implements types.MutableJSON
func (i IndexedJsonDocument) Remove(ctx context.Context, path string) (result types.MutableJSON, changed bool, err error) {
if path == "$" {
return nil, false, fmt.Errorf("The path expression '$' is not allowed in this context.")
}
err = tryWithFallback(
ctx,
i,
func() error {
result, changed, err = i.tryRemove(ctx, path)
return err
},
func(jsonDocument types.JSONDocument) error {
result, changed, err = jsonDocument.Remove(ctx, path)
return err
})
return result, changed, err
}

func (i IndexedJsonDocument) tryRemove(ctx context.Context, path string) (types.MutableJSON, bool, error) {
keyPath, err := jsonPathElementsFromMySQLJsonPath([]byte(path))
if err != nil {
return nil, false, err
}
return types.JSONDocument{Val: v}.Remove(ctx, path)

jsonCursor, found, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, keyPath, true)
if err != nil {
return nil, false, err
}
if !found {
// The key does not exist in the document.
return i, false, nil
}

// The cursor is now pointing to the end of the value prior to the one being removed.
jsonChunker, err := newJsonChunker(ctx, jsonCursor, i.m.NodeStore)
if err != nil {
return nil, false, err
}

startofRemovedLocation := jsonCursor.GetCurrentPath()
startofRemovedLocation = startofRemovedLocation.Clone()
isInitialElement := startofRemovedLocation.getScannerState().isInitialElement()

// Advance the cursor to the end of the value being removed.
keyPath.setScannerState(endOfValue)
_, err = jsonCursor.AdvanceToLocation(ctx, keyPath, false)
if err != nil {
return nil, false, err
}

// If removing the first element of an object/array, skip past the comma, and set the chunker as if it's
// at the start of the object/array.
if isInitialElement && jsonCursor.jsonScanner.current() == ',' {
jsonCursor.jsonScanner.valueOffset++
jsonChunker.jScanner.currentPath = startofRemovedLocation
}

newRoot, err := jsonChunker.Done(ctx)
if err != nil {
return nil, false, err
}

return NewIndexedJsonDocument(ctx, newRoot, i.m.NodeStore), true, nil
}

// Set is not yet implemented, so we call it on a types.JSONDocument instead.
Expand Down
71 changes: 71 additions & 0 deletions go/store/prolly/tree/json_indexed_document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package tree
import (
"context"
"fmt"
"slices"
"strings"
"testing"

Expand Down Expand Up @@ -177,6 +178,76 @@ func TestIndexedJsonDocument_Insert(t *testing.T) {

}

func TestIndexedJsonDocument_Remove(t *testing.T) {
ctx := sql.NewEmptyContext()
ns := NewTestNodeStore()
convertToIndexedJsonDocument := func(t *testing.T, s interface{}) interface{} {
return newIndexedJsonDocumentFromValue(t, ctx, ns, s)
}

testCases := jsontests.JsonRemoveTestCases(t, convertToIndexedJsonDocument)
jsontests.RunJsonTests(t, testCases)

t.Run("large document removals", func(t *testing.T) {

largeDoc := createLargeDocumentForTesting(t, ctx, ns)

for _, chunkBoundary := range largeDocumentChunkBoundaries {
t.Run(jsonPathTypeNames[chunkBoundary.pathType], func(t *testing.T) {
// For each tested chunk boundary, get the path to the object containing that crosses that boundary, and remove it.
removalPointString := chunkBoundary.path[:strings.LastIndex(chunkBoundary.path, ".")]

removalPointLocation, err := jsonPathElementsFromMySQLJsonPath([]byte(removalPointString))
require.NoError(t, err)

// Test that the value exists prior to removal
result, err := largeDoc.Lookup(ctx, removalPointString)
require.NoError(t, err)
require.NotNil(t, result)

newDoc, changed, err := largeDoc.Remove(ctx, removalPointString)
require.NoError(t, err)
require.True(t, changed)

// test that new value is valid by calling ToInterface
v, err := newDoc.ToInterface()
require.NoError(t, err)

// If the removed value was an object key, check that the key no longer exists.
// If the removed value was an array element, check that the parent array has shrunk.
if removalPointLocation.getLastPathElement().isArrayIndex {
// Check that the parent array has shrunk.
arrayIndex := int(removalPointLocation.getLastPathElement().getArrayIndex())
parentLocation := removalPointLocation.Clone()
parentLocation.pop()

getParentArray := func(doc IndexedJsonDocument) []interface{} {
arrayWrapper, err := doc.lookupByLocation(ctx, parentLocation)
require.NoError(t, err)
arrayInterface, err := arrayWrapper.ToInterface()
require.NoError(t, err)
require.IsType(t, []interface{}{}, arrayInterface)
return arrayInterface.([]interface{})
}

oldArray := getParentArray(largeDoc)
newArray := getParentArray(newDoc.(IndexedJsonDocument))

oldArray = slices.Delete(oldArray, arrayIndex, arrayIndex+1)

require.Equal(t, oldArray, newArray)
} else {
// Check that the key no longer exists.
newJsonDocument := types.JSONDocument{Val: v}
result, err = newJsonDocument.Lookup(ctx, removalPointString)
require.NoError(t, err)
require.Nil(t, result)
}
})
}
})
}

func TestIndexedJsonDocument_Extract(t *testing.T) {
ctx := context.Background()
ns := NewTestNodeStore()
Expand Down
4 changes: 4 additions & 0 deletions go/store/prolly/tree/json_location.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ const (
endOfValue
)

func (t jsonPathType) isInitialElement() bool {
return t == objectInitialElement || t == arrayInitialElement
}

var unknownLocationKeyError = fmt.Errorf("A JSON document was written with a future version of Dolt, and the index metadata cannot be read. This will impact performance for large documents.")
var unsupportedPathError = fmt.Errorf("The supplied JSON path is not supported for optimized lookup, falling back on unoptimized implementation.")

Expand Down

0 comments on commit 094ba06

Please sign in to comment.