From c1dea5951fd513d77b479fcd44c77be81f757ec9 Mon Sep 17 00:00:00 2001
From: Ravishankar
Date: Fri, 8 Jul 2022 03:31:13 +0530
Subject: [PATCH] [8835] Fix for CSV files surrounding space (#9028)

---
 .../inputformat/csv/CSVRecordReader.java      |  2 +-
 .../csv/CSVRecordExtractorTest.java           | 53 ++++++++++++++++++-
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 471dee9ea4c..a95ee470364 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -80,7 +80,7 @@ public void init(File dataFile, @Nullable Set<String> fieldsToRead, @Nullable Re
         }
       }
       char delimiter = config.getDelimiter();
-      format = format.withDelimiter(delimiter);
+      format = format.withDelimiter(delimiter).withIgnoreSurroundingSpaces(true);
       String csvHeader = config.getHeader();
       if (csvHeader == null) {
         format = format.withHeader();
diff --git a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
index d23fbd0dd70..4dea94f3989 100644
--- a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
+++ b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordExtractorTest.java
@@ -105,6 +105,57 @@ protected void checkValue(Map<String, Object> inputRecord, GenericRow genericRow
     }
   }
 
+  @Test
+  public void testRemovingSurroundingSpaces() throws IOException {
+    CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+
+    // Create a CSV file whose header and unquoted values carry stray leading/trailing spaces.
+    File spaceFile = new File(_tempDir, "space.csv");
+    try (BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile))) {
+      writer.write("col1 ,col2\n");
+      writer.write(" value11, value12");
+    }
+
+    CSVRecordReader csvRecordReader = new CSVRecordReader();
+    HashSet<String> fieldsToRead = new HashSet<>();
+    fieldsToRead.add("col1");
+    fieldsToRead.add("col2");
+    csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
+    GenericRow genericRow = new GenericRow();
+    csvRecordReader.rewind();
+
+    // Header names and unquoted values must come back with the surrounding spaces trimmed.
+    Assert.assertTrue(csvRecordReader.hasNext());
+    csvRecordReader.next(genericRow);
+    Assert.assertEquals(genericRow.getValue("col1"), "value11");
+    Assert.assertEquals(genericRow.getValue("col2"), "value12");
+  }
+
+  @Test
+  public void testIgnoringSurroundingSpaces() throws IOException {
+    CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();
+
+    // Create a CSV file whose second value is quoted, so the leading space inside the quotes must be kept.
+    File spaceFile = new File(_tempDir, "space.csv");
+    try (BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile))) {
+      writer.write("col1 ,col2\n");
+      writer.write("\"value11\",\" value12\"");
+    }
+
+    CSVRecordReader csvRecordReader = new CSVRecordReader();
+    HashSet<String> fieldsToRead = new HashSet<>();
+    fieldsToRead.add("col1");
+    fieldsToRead.add("col2");
+    csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
+    GenericRow genericRow = new GenericRow();
+    csvRecordReader.rewind();
+
+    // Spaces inside the quotes are part of the value and must survive parsing.
+    Assert.assertTrue(csvRecordReader.hasNext());
+    csvRecordReader.next(genericRow);
+    Assert.assertEquals(genericRow.getValue("col1"), "value11");
+    Assert.assertEquals(genericRow.getValue("col2"), " value12");
+  }
   /**
    * Check if we can parse a CSV file that has escaped comma characters within fields.
    */
@@ -135,6 +186,6 @@ public void testEscapeCharacterInCSV()
     Assert.assertTrue(csvRecordReader.hasNext());
     csvRecordReader.next(genericRow);
     Assert.assertEquals(genericRow.getValue("first"), "string1");
-    Assert.assertEquals(genericRow.getValue("second"), " string2, string3");
+    Assert.assertEquals(genericRow.getValue("second"), "string2, string3");
   }
 }