Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize JSON_REMOVE on IndexedJsonDocument #8103

Merged
merged 4 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go/store/prolly/tree/json_chunker.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func (j *JsonChunker) Done(ctx context.Context) (Node, error) {
// When inserting into the beginning of an object or array, we need to add an extra comma.
// We could track then in the chunker, but it's easier to just check the next part of JSON to determine
// whether we need the comma.
if jsonBytes[0] != '}' && jsonBytes[0] != ']' && jsonBytes[0] != ',' {
if j.jScanner.currentPath.getScannerState() == endOfValue && jsonBytes[0] != '}' && jsonBytes[0] != ']' && jsonBytes[0] != ',' {
j.appendJsonToBuffer([]byte(","))
}
// Append the rest of the JsonCursor, then continue until we either exhaust the cursor, or we coincide with a boundary from the original tree.
Expand Down
30 changes: 18 additions & 12 deletions go/store/prolly/tree/json_cursor.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,24 +50,25 @@ func getPreviousKey(ctx context.Context, cur *cursor) ([]byte, error) {

// newJsonCursor takes the root node of a prolly tree representing a JSON document, and creates a new JsonCursor for reading
// JSON, starting at the specified location in the document.
func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLocation) (*JsonCursor, error) {
func newJsonCursor(ctx context.Context, ns NodeStore, root Node, startKey jsonLocation, forRemoval bool) (*JsonCursor, bool, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(minor) Would be nice to give the return values names, or to document them (the boolean in particular) so that it's easier for future readers to quickly understand what the boolean return param indicates.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a comment and return variable names.

cur, err := newCursorAtKey(ctx, ns, root, startKey.key, jsonLocationOrdering{})
if err != nil {
return nil, err
return nil, false, err
}
previousKey, err := getPreviousKey(ctx, cur)
if err != nil {
return nil, err
return nil, false, err
}
jsonBytes := cur.currentValue()
jsonDecoder := ScanJsonFromMiddleWithKey(jsonBytes, previousKey)

jcur := JsonCursor{cur: cur, jsonScanner: jsonDecoder}
err = jcur.AdvanceToLocation(ctx, startKey)

found, err := jcur.AdvanceToLocation(ctx, startKey, forRemoval)
if err != nil {
return nil, err
return nil, found, err
}
return &jcur, nil
return &jcur, found, nil
}

func (j JsonCursor) Valid() bool {
Expand Down Expand Up @@ -122,20 +123,20 @@ func (j *JsonCursor) isKeyInChunk(path jsonLocation) bool {
return compareJsonLocations(path, nodeEndPosition) <= 0
}

func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation) error {
func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation, forRemoval bool) (found bool, err error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some method docs would help readers quickly understand the params without having to read the implementation; especially since this method is exposed outside the package, it would be helpful to add docs. For example, it seems important for callers to know that if the third param is passed as true, then the cursor position returned will be different. Currently, readers would have to read the implementation to figure out how to use this method correctly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a docstring.

if !j.isKeyInChunk(path) {
// Our destination is in another chunk, load it.
err := Seek(ctx, j.cur.parent, path.key, jsonLocationOrdering{})
if err != nil {
return err
return false, err
}
j.cur.nd, err = fetchChild(ctx, j.cur.nrw, j.cur.parent.currentRef())
if err != nil {
return err
return false, err
}
previousKey, err := getPreviousKey(ctx, j.cur)
if err != nil {
return err
return false, err
}
j.jsonScanner = ScanJsonFromMiddleWithKey(j.cur.currentValue(), previousKey)
}
Expand All @@ -150,16 +151,21 @@ func (j *JsonCursor) AdvanceToLocation(ctx context.Context, path jsonLocation) e
// there is no path greater than the end-of-document path, which is always the last key.
panic("Reached the end of the JSON document while advancing. This should not be possible. Is the document corrupt?")
} else if err != nil {
return err
return false, err
}
cmp = compareJsonLocations(j.jsonScanner.currentPath, path)
}
// If the supplied path doesn't exist in the document, then we want to stop the cursor at the start of the point
// were it would appear. This may mean that we've gone too far and need to rewind one location.
if cmp > 0 {
j.jsonScanner = previousScanner
return false, nil
}
// If the element is going to be removed, rewind the scanner one location, to before the key of the removed object.
if forRemoval {
j.jsonScanner = previousScanner
}
return nil
return true, nil
}

func (j *JsonCursor) AdvanceToNextLocation(ctx context.Context) (crossedBoundary bool, err error) {
Expand Down
88 changes: 76 additions & 12 deletions go/store/prolly/tree/json_indexed_document.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,16 @@ func (i IndexedJsonDocument) tryLookup(ctx context.Context, pathString string) (
return nil, err
}

jCur, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, path)
return i.lookupByLocation(ctx, path)
}

func (i IndexedJsonDocument) lookupByLocation(ctx context.Context, path jsonLocation) (sql.JSONWrapper, error) {
jCur, found, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, path, false)
if err != nil {
return nil, err
}

cursorPath := jCur.GetCurrentPath()
cmp := compareJsonLocations(cursorPath, path)
if cmp != 0 {
if !found {
// The key doesn't exist in the document.
return nil, nil
}
Expand Down Expand Up @@ -191,18 +193,22 @@ func (i IndexedJsonDocument) tryInsert(ctx context.Context, path string, val sql
return nil, false, err
}

jsonCursor, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, keyPath)
jsonCursor, found, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, keyPath, false)
if err != nil {
return nil, false, err
}

cursorPath := jsonCursor.GetCurrentPath()
cmp := compareJsonLocations(cursorPath, keyPath)
if cmp == 0 {
if found {
// The key already exists in the document.
return i, false, nil
}

return i.insertIntoCursor(ctx, keyPath, jsonCursor, val)
}

func (i IndexedJsonDocument) insertIntoCursor(ctx context.Context, keyPath jsonLocation, jsonCursor *JsonCursor, val sql.JSONWrapper) (types.MutableJSON, bool, error) {
cursorPath := jsonCursor.GetCurrentPath()

// If the inserted path is equivalent to "$" (which also includes "$[0]" on non-arrays), do nothing.
if cursorPath.size() == 0 && cursorPath.getScannerState() == startOfValue {
return i, false, nil
Expand Down Expand Up @@ -272,6 +278,7 @@ func (i IndexedJsonDocument) tryInsert(ctx context.Context, path string, val sql
// For example, attempting to insert into the path "$.a.b" in the document {"a": 1}
// We can detect this by checking to see if the insertion point in the original document comes before the inserted path.
// (For example, the insertion point occurs at $.a.START, which is before $.a.b)
cmp := compareJsonLocations(cursorPath, keyPath)
if cmp < 0 && cursorPath.getScannerState() == startOfValue {
// We just attempted to insert into a scalar.
return i, false, nil
Expand Down Expand Up @@ -313,13 +320,70 @@ func (i IndexedJsonDocument) tryInsert(ctx context.Context, path string, val sql
return NewIndexedJsonDocument(ctx, newRoot, i.m.NodeStore), true, nil
}

// Remove is not yet implemented, so we call it on a types.JSONDocument instead.
func (i IndexedJsonDocument) Remove(ctx context.Context, path string) (types.MutableJSON, bool, error) {
v, err := i.ToInterface()
// Remove implements types.MutableJSON
func (i IndexedJsonDocument) Remove(ctx context.Context, path string) (result types.MutableJSON, changed bool, err error) {
if path == "$" {
return nil, false, fmt.Errorf("The path expression '$' is not allowed in this context.")
}
err = tryWithFallback(
ctx,
i,
func() error {
result, changed, err = i.tryRemove(ctx, path)
return err
},
func(jsonDocument types.JSONDocument) error {
result, changed, err = jsonDocument.Remove(ctx, path)
return err
})
return result, changed, err
}

func (i IndexedJsonDocument) tryRemove(ctx context.Context, path string) (types.MutableJSON, bool, error) {
keyPath, err := jsonPathElementsFromMySQLJsonPath([]byte(path))
if err != nil {
return nil, false, err
}
return types.JSONDocument{Val: v}.Remove(ctx, path)

jsonCursor, found, err := newJsonCursor(ctx, i.m.NodeStore, i.m.Root, keyPath, true)
if err != nil {
return nil, false, err
}
if !found {
// The key does not exist in the document.
return i, false, nil
}

// The cursor is now pointing to the end of the value prior to the one being removed.
jsonChunker, err := newJsonChunker(ctx, jsonCursor, i.m.NodeStore)
if err != nil {
return nil, false, err
}

startofRemovedLocation := jsonCursor.GetCurrentPath()
startofRemovedLocation = startofRemovedLocation.Clone()
isInitialElement := startofRemovedLocation.getScannerState().isInitialElement()

// Advance the cursor to the end of the value being removed.
keyPath.setScannerState(endOfValue)
_, err = jsonCursor.AdvanceToLocation(ctx, keyPath, false)
if err != nil {
return nil, false, err
}

// If removing the first element of an object/array, skip past the comma, and set the chunker as if it's
// at the start of the object/array.
if isInitialElement && jsonCursor.jsonScanner.current() == ',' {
jsonCursor.jsonScanner.valueOffset++
jsonChunker.jScanner.currentPath = startofRemovedLocation
}

newRoot, err := jsonChunker.Done(ctx)
if err != nil {
return nil, false, err
}

return NewIndexedJsonDocument(ctx, newRoot, i.m.NodeStore), true, nil
}

// Set is not yet implemented, so we call it on a types.JSONDocument instead.
Expand Down
71 changes: 71 additions & 0 deletions go/store/prolly/tree/json_indexed_document_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package tree
import (
"context"
"fmt"
"slices"
"strings"
"testing"

Expand Down Expand Up @@ -177,6 +178,76 @@ func TestIndexedJsonDocument_Insert(t *testing.T) {

}

func TestIndexedJsonDocument_Remove(t *testing.T) {
ctx := sql.NewEmptyContext()
ns := NewTestNodeStore()
convertToIndexedJsonDocument := func(t *testing.T, s interface{}) interface{} {
return newIndexedJsonDocumentFromValue(t, ctx, ns, s)
}

testCases := jsontests.JsonRemoveTestCases(t, convertToIndexedJsonDocument)
jsontests.RunJsonTests(t, testCases)

t.Run("large document removals", func(t *testing.T) {

largeDoc := createLargeDocumentForTesting(t, ctx, ns)

for _, chunkBoundary := range largeDocumentChunkBoundaries {
t.Run(jsonPathTypeNames[chunkBoundary.pathType], func(t *testing.T) {
// For each tested chunk boundary, get the path to the object containing that crosses that boundary, and remove it.
removalPointString := chunkBoundary.path[:strings.LastIndex(chunkBoundary.path, ".")]

removalPointLocation, err := jsonPathElementsFromMySQLJsonPath([]byte(removalPointString))
require.NoError(t, err)

// Test that the value exists prior to removal
result, err := largeDoc.Lookup(ctx, removalPointString)
require.NoError(t, err)
require.NotNil(t, result)

newDoc, changed, err := largeDoc.Remove(ctx, removalPointString)
require.NoError(t, err)
require.True(t, changed)

// test that new value is valid by calling ToInterface
v, err := newDoc.ToInterface()
require.NoError(t, err)

// If the removed value was an object key, check that the key no longer exists.
// If the removed value was an array element, check that the parent array has shrunk.
if removalPointLocation.getLastPathElement().isArrayIndex {
// Check that the parent array has shrunk.
arrayIndex := int(removalPointLocation.getLastPathElement().getArrayIndex())
parentLocation := removalPointLocation.Clone()
parentLocation.pop()

getParentArray := func(doc IndexedJsonDocument) []interface{} {
arrayWrapper, err := doc.lookupByLocation(ctx, parentLocation)
require.NoError(t, err)
arrayInterface, err := arrayWrapper.ToInterface()
require.NoError(t, err)
require.IsType(t, []interface{}{}, arrayInterface)
return arrayInterface.([]interface{})
}

oldArray := getParentArray(largeDoc)
newArray := getParentArray(newDoc.(IndexedJsonDocument))

oldArray = slices.Delete(oldArray, arrayIndex, arrayIndex+1)

require.Equal(t, oldArray, newArray)
} else {
// Check that the key no longer exists.
newJsonDocument := types.JSONDocument{Val: v}
result, err = newJsonDocument.Lookup(ctx, removalPointString)
require.NoError(t, err)
require.Nil(t, result)
}
})
}
})
}

func TestIndexedJsonDocument_Extract(t *testing.T) {
ctx := context.Background()
ns := NewTestNodeStore()
Expand Down
4 changes: 4 additions & 0 deletions go/store/prolly/tree/json_location.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ const (
endOfValue
)

func (t jsonPathType) isInitialElement() bool {
return t == objectInitialElement || t == arrayInitialElement
}

var unknownLocationKeyError = fmt.Errorf("A JSON document was written with a future version of Dolt, and the index metadata cannot be read. This will impact performance for large documents.")
var unsupportedPathError = fmt.Errorf("The supplied JSON path is not supported for optimized lookup, falling back on unoptimized implementation.")

Expand Down
Loading