Skip to content

Commit

Permalink
Fix parseToken array popping (#13620) (#15235)
Browse files Browse the repository at this point in the history
* Fix parseToken array popping



* Add fix description



* Refactor JsonToStringXContentParser#parseToken

This method was pretty complicated, hard to follow, and therefore prone
to bugs.

This change introduces a proper stack (rather than a StringBuilder) to
keep track of fields under fields. When ever we parse a field name, we
push it onto the stack, recursively parse its value (an object, an
array, or a primitive value), then we pop (guaranteeing pushes and pops
are balanced).



* Remove use of org.apache.logging.log4j.util.Strings



* Make sure JsonToStringXContentParser ends on END_OBJECT

There is logic in DocumentParser that expects any object parser to start
with currentToken() pointing to START_OBJECT, and return with
currentToken() pointing to END_OBJECT.

My previous commits were stepping over the start/end, and returning on
the token following the end.



* Throw exception when flat object is null

The existing behavior throws an exception on null flat object by
advancing the parser beyond the end of the flat object input.

We could simply skip a null flat_object field, but this would be
a behavioral change from 2.7.



---------




(cherry picked from commit 6e27e5a)

Signed-off-by: naomichi-y <n.yamakita@gmail.com>
Signed-off-by: Michael Froh <froh@amazon.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Michael Froh <froh@amazon.com>
  • Loading branch information
3 people committed Aug 13, 2024
1 parent d07a3d6 commit 044f564
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 72 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix missing value of FieldSort for unsigned_long ([#14963](https://github.com/opensearch-project/OpenSearch/pull/14963))
- Fix delete index template failed when the index template matches a data stream but is unused ([#15080](https://github.com/opensearch-project/OpenSearch/pull/15080))
- Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
- Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))

### Security

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
package org.opensearch.common.xcontent;

import org.opensearch.common.xcontent.json.JsonXContent;
import org.opensearch.core.common.Strings;
import org.opensearch.core.common.bytes.BytesReference;
import org.opensearch.core.xcontent.AbstractXContentParser;
import org.opensearch.core.xcontent.DeprecationHandler;
Expand All @@ -23,6 +24,9 @@
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.LinkedList;

/**
* JsonToStringParser is the main parser class to transform JSON into stringFields in a XContentParser
Expand All @@ -32,21 +36,20 @@
*/
public class JsonToStringXContentParser extends AbstractXContentParser {
private final String fieldTypeName;
private XContentParser parser;
private final XContentParser parser;

private ArrayList<String> valueList = new ArrayList<>();
private ArrayList<String> valueAndPathList = new ArrayList<>();
private ArrayList<String> keyList = new ArrayList<>();
private final ArrayList<String> valueList = new ArrayList<>();
private final ArrayList<String> valueAndPathList = new ArrayList<>();
private final ArrayList<String> keyList = new ArrayList<>();

private XContentBuilder builder = XContentBuilder.builder(JsonXContent.jsonXContent);
private final XContentBuilder builder = XContentBuilder.builder(JsonXContent.jsonXContent);

private NamedXContentRegistry xContentRegistry;
private final NamedXContentRegistry xContentRegistry;

private DeprecationHandler deprecationHandler;
private final DeprecationHandler deprecationHandler;

private static final String VALUE_AND_PATH_SUFFIX = "._valueAndPath";
private static final String VALUE_SUFFIX = "._value";
private static final String DOT_SYMBOL = ".";
private static final String EQUAL_SYMBOL = "=";

public JsonToStringXContentParser(
Expand All @@ -63,9 +66,14 @@ public JsonToStringXContentParser(
}

public XContentParser parseObject() throws IOException {
assert currentToken() == Token.START_OBJECT;
parser.nextToken(); // Skip the outer START_OBJECT. Need to return on END_OBJECT.

builder.startObject();
StringBuilder path = new StringBuilder(fieldTypeName);
parseToken(path, null);
LinkedList<String> path = new LinkedList<>(Collections.singleton(fieldTypeName));
while (currentToken() != Token.END_OBJECT) {
parseToken(path);
}
builder.field(this.fieldTypeName, keyList);
builder.field(this.fieldTypeName + VALUE_SUFFIX, valueList);
builder.field(this.fieldTypeName + VALUE_AND_PATH_SUFFIX, valueAndPathList);
Expand All @@ -74,75 +82,55 @@ public XContentParser parseObject() throws IOException {
return JsonXContent.jsonXContent.createParser(this.xContentRegistry, this.deprecationHandler, String.valueOf(jString));
}

private void parseToken(StringBuilder path, String currentFieldName) throws IOException {

while (this.parser.nextToken() != Token.END_OBJECT) {
if (this.parser.currentName() != null) {
currentFieldName = this.parser.currentName();
private void parseToken(Deque<String> path) throws IOException {
if (this.parser.currentToken() == Token.FIELD_NAME) {
String fieldName = this.parser.currentName();
path.addLast(fieldName); // Pushing onto the stack *must* be matched by pop
String parts = fieldName;
while (parts.contains(".")) { // Extract the intermediate keys maybe present in fieldName
int dotPos = parts.indexOf('.');
String part = parts.substring(0, dotPos);
this.keyList.add(part);
parts = parts.substring(dotPos + 1);
}
StringBuilder parsedFields = new StringBuilder();

if (this.parser.currentToken() == Token.FIELD_NAME) {
path.append(DOT_SYMBOL).append(currentFieldName);
int dotIndex = currentFieldName.indexOf(DOT_SYMBOL);
String fieldNameSuffix = currentFieldName;
// The field name may be of the form foo.bar.baz
// If that's the case, each "part" is a key.
while (dotIndex >= 0) {
String fieldNamePrefix = fieldNameSuffix.substring(0, dotIndex);
if (!fieldNamePrefix.isEmpty()) {
this.keyList.add(fieldNamePrefix);
}
fieldNameSuffix = fieldNameSuffix.substring(dotIndex + 1);
dotIndex = fieldNameSuffix.indexOf(DOT_SYMBOL);
}
if (!fieldNameSuffix.isEmpty()) {
this.keyList.add(fieldNameSuffix);
}
} else if (this.parser.currentToken() == Token.START_ARRAY) {
parseToken(path, currentFieldName);
break;
} else if (this.parser.currentToken() == Token.END_ARRAY) {
// skip
} else if (this.parser.currentToken() == Token.START_OBJECT) {
parseToken(path, currentFieldName);
int dotIndex = path.lastIndexOf(DOT_SYMBOL, path.length());

if (dotIndex != -1 && path.length() > currentFieldName.length()) {
path.setLength(path.length() - currentFieldName.length() - 1);
}
} else {
if (!path.toString().contains(currentFieldName)) {
path.append(DOT_SYMBOL).append(currentFieldName);
}
parseValue(parsedFields);
this.valueList.add(parsedFields.toString());
this.valueAndPathList.add(path + EQUAL_SYMBOL + parsedFields);
int dotIndex = path.lastIndexOf(DOT_SYMBOL, path.length());
if (dotIndex != -1 && path.length() > currentFieldName.length()) {
path.setLength(path.length() - currentFieldName.length() - 1);
}
this.keyList.add(parts); // parts has no dot, so either it's the original fieldName or it's the last part
this.parser.nextToken(); // advance to the value of fieldName
parseToken(path); // parse the value for fieldName (which will be an array, an object, or a primitive value)
path.removeLast(); // Here is where we pop fieldName from the stack (since we're done with the value of fieldName)
// Note that whichever other branch we just passed through has already ended with nextToken(), so we
// don't need to call it.
} else if (this.parser.currentToken() == Token.START_ARRAY) {
parser.nextToken();
while (this.parser.currentToken() != Token.END_ARRAY) {
parseToken(path);
}

this.parser.nextToken();
} else if (this.parser.currentToken() == Token.START_OBJECT) {
parser.nextToken();
while (this.parser.currentToken() != Token.END_OBJECT) {
parseToken(path);
}
this.parser.nextToken();
} else if (this.parser.currentToken().isValue()) {
String parsedValue = parseValue();
if (parsedValue != null) {
this.valueList.add(parsedValue);
this.valueAndPathList.add(Strings.collectionToDelimitedString(path, ".") + EQUAL_SYMBOL + parsedValue);
}
this.parser.nextToken();
}
}

private void parseValue(StringBuilder parsedFields) throws IOException {
private String parseValue() throws IOException {
switch (this.parser.currentToken()) {
case VALUE_BOOLEAN:
case VALUE_NUMBER:
case VALUE_STRING:
case VALUE_NULL:
parsedFields.append(this.parser.textOrNull());
break;
return this.parser.textOrNull();
// Handle other token types as needed
case FIELD_NAME:
case VALUE_EMBEDDED_OBJECT:
case END_ARRAY:
case START_ARRAY:
break;
default:
throw new IOException("Unsupported token type [" + parser.currentToken() + "]");
throw new IOException("Unsupported value token type [" + parser.currentToken() + "]");
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -568,8 +568,12 @@ protected void parseCreateField(ParseContext context) throws IOException {
if (context.externalValueSet()) {
String value = context.externalValue().toString();
parseValueAddFields(context, value, fieldType().name());
} else if (context.parser().currentToken() == XContentParser.Token.VALUE_NULL) {
context.parser().nextToken(); // This triggers an exception in DocumentParser.
// We could remove the above nextToken() call to skip the null value, but the existing
// behavior (since 2.7) throws the exception.
} else {
JsonToStringXContentParser JsonToStringParser = new JsonToStringXContentParser(
JsonToStringXContentParser jsonToStringParser = new JsonToStringXContentParser(
NamedXContentRegistry.EMPTY,
DeprecationHandler.IGNORE_DEPRECATIONS,
context.parser(),
Expand All @@ -579,7 +583,7 @@ protected void parseCreateField(ParseContext context) throws IOException {
JsonToStringParser is the main parser class to transform JSON into stringFields in a XContentParser
It reads the JSON object and parsed to a list of string
*/
XContentParser parser = JsonToStringParser.parseObject();
XContentParser parser = jsonToStringParser.parseObject();

XContentParser.Token currentToken;
while ((currentToken = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
Expand All @@ -594,7 +598,6 @@ protected void parseCreateField(ParseContext context) throws IOException {
}

}

}

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
public class JsonToStringXContentParserTests extends OpenSearchTestCase {

private String flattenJsonString(String fieldName, String in) throws IOException {
String transformed;
try (
XContentParser parser = JsonXContent.jsonXContent.createParser(
xContentRegistry(),
Expand All @@ -33,10 +32,11 @@ private String flattenJsonString(String fieldName, String in) throws IOException
parser,
fieldName
);
// Skip the START_OBJECT token:
// Point to the first token (should be START_OBJECT)
jsonToStringXContentParser.nextToken();

XContentParser transformedParser = jsonToStringXContentParser.parseObject();
assertSame(XContentParser.Token.END_OBJECT, jsonToStringXContentParser.currentToken());
try (XContentBuilder jsonBuilder = XContentFactory.jsonBuilder()) {
jsonBuilder.copyCurrentStructure(transformedParser);
return jsonBuilder.toString();
Expand Down Expand Up @@ -110,4 +110,57 @@ public void testNestChildObjectWithDotsAndFieldWithDots() throws IOException {
);
}

public void testArrayOfObjects() throws IOException {
String jsonExample = "{"
+ "\"field\": {"
+ " \"detail\": {"
+ " \"foooooooooooo\": ["
+ " {\"name\":\"baz\"},"
+ " {\"name\":\"baz\"}"
+ " ]"
+ " }"
+ "}}";

assertEquals(
"{"
+ "\"flat\":[\"field\",\"detail\",\"foooooooooooo\",\"name\",\"name\"],"
+ "\"flat._value\":[\"baz\",\"baz\"],"
+ "\"flat._valueAndPath\":["
+ "\"flat.field.detail.foooooooooooo.name=baz\","
+ "\"flat.field.detail.foooooooooooo.name=baz\""
+ "]}",
flattenJsonString("flat", jsonExample)
);
}

public void testArraysOfObjectsAndValues() throws IOException {
String jsonExample = "{"
+ "\"field\": {"
+ " \"detail\": {"
+ " \"foooooooooooo\": ["
+ " {\"name\":\"baz\"},"
+ " {\"name\":\"baz\"}"
+ " ]"
+ " },"
+ " \"numbers\" : ["
+ " 1,"
+ " 2,"
+ " 3"
+ " ]"
+ "}}";

assertEquals(
"{"
+ "\"flat\":[\"field\",\"detail\",\"foooooooooooo\",\"name\",\"name\",\"numbers\"],"
+ "\"flat._value\":[\"baz\",\"baz\",\"1\",\"2\",\"3\"],"
+ "\"flat._valueAndPath\":["
+ "\"flat.field.detail.foooooooooooo.name=baz\","
+ "\"flat.field.detail.foooooooooooo.name=baz\","
+ "\"flat.field.numbers=1\","
+ "\"flat.field.numbers=2\","
+ "\"flat.field.numbers=3\""
+ "]}",
flattenJsonString("flat", jsonExample)
);
}
}

0 comments on commit 044f564

Please sign in to comment.