Skip to content

Commit

Permalink
[8835] Fix for CSV files surrounding space (#9028)
Browse files Browse the repository at this point in the history
  • Loading branch information
ravishankar15 authored Jul 7, 2022
1 parent 1bcc032 commit c1dea59
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ public void init(File dataFile, @Nullable Set<String> fieldsToRead, @Nullable Re
}
}
char delimiter = config.getDelimiter();
format = format.withDelimiter(delimiter);
format = format.withDelimiter(delimiter).withIgnoreSurroundingSpaces(true);
String csvHeader = config.getHeader();
if (csvHeader == null) {
format = format.withHeader();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,57 @@ protected void checkValue(Map<String, Object> inputRecord, GenericRow genericRow
}
}

/**
 * Checks that whitespace surrounding unquoted header names and unquoted values is dropped by the reader.
 *
 * The fixture has a trailing space after the first header name and a leading space before each value; the
 * record is expected to be readable under the bare column names and to yield the bare values.
 *
 * @throws IOException if the fixture file cannot be written or the reader fails to read it
 */
@Test
public void testRemovingSurroundingSpaces() throws IOException {
  CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();

  // Create a CSV file whose header has a trailing space after "col1" and whose single record has a leading
  // space in front of both (unquoted) values.
  File spaceFile = new File(_tempDir, "space.csv");
  // try-with-resources so the writer is closed (and flushed) even if one of the writes throws.
  try (BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile))) {
    writer.write("col1 ,col2\n");
    writer.write(" value11, value12");
  }

  CSVRecordReader csvRecordReader = new CSVRecordReader();
  HashSet<String> fieldsToRead = new HashSet<>();
  fieldsToRead.add("col1");
  fieldsToRead.add("col2");
  csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
  GenericRow genericRow = new GenericRow();
  csvRecordReader.rewind();

  // Parsing must succeed and both columns must come back without the surrounding spaces.
  Assert.assertTrue(csvRecordReader.hasNext());
  csvRecordReader.next(genericRow);
  Assert.assertEquals(genericRow.getValue("col1"), "value11");
  Assert.assertEquals(genericRow.getValue("col2"), "value12");
}

/**
 * Checks that whitespace located inside a quoted value is kept as part of the value.
 *
 * The fixture quotes both values and puts a leading space inside the quotes of the second one; that space is
 * expected to be present in the value read back for {@code col2}.
 *
 * @throws IOException if the fixture file cannot be written or the reader fails to read it
 */
@Test
public void testIgnoringSurroundingSpaces() throws IOException {
  CSVRecordReaderConfig csvRecordReaderConfig = new CSVRecordReaderConfig();

  // Create a CSV file whose single record has two quoted values; the second one carries a leading space
  // inside its quotes. The header still has a trailing space after "col1".
  File spaceFile = new File(_tempDir, "space.csv");
  // try-with-resources so the writer is closed (and flushed) even if one of the writes throws.
  try (BufferedWriter writer = new BufferedWriter(new FileWriter(spaceFile))) {
    writer.write("col1 ,col2\n");
    writer.write("\"value11\",\" value12\"");
  }

  CSVRecordReader csvRecordReader = new CSVRecordReader();
  HashSet<String> fieldsToRead = new HashSet<>();
  fieldsToRead.add("col1");
  fieldsToRead.add("col2");
  csvRecordReader.init(spaceFile, fieldsToRead, csvRecordReaderConfig);
  GenericRow genericRow = new GenericRow();
  csvRecordReader.rewind();

  // Parsing must succeed; the space inside the quotes of the second value must be preserved.
  Assert.assertTrue(csvRecordReader.hasNext());
  csvRecordReader.next(genericRow);
  Assert.assertEquals(genericRow.getValue("col1"), "value11");
  Assert.assertEquals(genericRow.getValue("col2"), " value12");
}
/**
* Check if we can parse a CSV file that has escaped comma characters within fields.
*/
Expand Down Expand Up @@ -135,6 +186,6 @@ public void testEscapeCharacterInCSV()
Assert.assertTrue(csvRecordReader.hasNext());
csvRecordReader.next(genericRow);
Assert.assertEquals(genericRow.getValue("first"), "string1");
Assert.assertEquals(genericRow.getValue("second"), " string2, string3");
Assert.assertEquals(genericRow.getValue("second"), "string2, string3");
}
}

0 comments on commit c1dea59

Please sign in to comment.