-
Notifications
You must be signed in to change notification settings - Fork 5
/
CiffToCsv.java
81 lines (67 loc) · 2.97 KB
/
CiffToCsv.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
package nl.ru.convert;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import org.kohsuke.args4j.*;
/**
 * Command-line tool that reads a stream of length-delimited {@code CommonIndexFileFormat}
 * protobuf messages (one header, then the postings lists, then the document records) and
 * writes the contents as three pipe-separated text files, one record per line:
 *
 * <ul>
 *   <li>dict file: {@code termID|term|df}</li>
 *   <li>terms file: {@code termID|docID|tf}</li>
 *   <li>docs file: {@code collectionDocid|docid|doclength}</li>
 * </ul>
 *
 * <p>Term ids are assigned by position: the n-th postings list in the input gets term id n,
 * starting at 0. Input whose name ends in {@code .gz} is decompressed on the fly. Output files
 * are always written as UTF-8.
 */
public class CiffToCsv {

    /** Separator placed between the fields of every output line. */
    private static final String FIELD_SEPARATOR = "|";

    /** Command-line options, populated by args4j. All four options are required. */
    public static class Args {
        /** Path of the postings file to read; a {@code .gz} suffix enables gzip decompression. */
        @Option(name = "-input", metaVar = "[file]", required = true, usage = "postings file")
        public String input = "";

        /** Path of the dict file to write ({@code termID|term|df}). */
        @Option(name = "-dict", metaVar = "[file]", required = true, usage = "output dict file")
        public String dict = "";

        /** Path of the terms file to write ({@code termID|docID|tf}). */
        @Option(name = "-terms", metaVar = "[file]", required = true, usage = "output terms file")
        public String terms = "";

        /** Path of the docs file to write ({@code collectionDocid|docid|doclength}). */
        @Option(name = "-docs", metaVar = "[file]", required = true, usage = "output docs file")
        public String docs = "";
    }

    /**
     * Parses the command line and runs the conversion.
     *
     * <p>If the arguments cannot be parsed, the error, the usage text and an example invocation
     * are printed to standard error and the method returns without converting anything.
     *
     * @param argv command-line arguments, see {@link Args}
     * @throws Exception if a file cannot be opened, read or written, if the input ends before
     *     all announced records were read, or if a postings list is inconsistent
     */
    public static void main(String[] argv) throws Exception {
        CiffToCsv.Args args = new CiffToCsv.Args();
        CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));

        try {
            parser.parseArgument(argv);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            parser.printUsage(System.err);
            System.err.println("Example: CiffToCsv " + parser.printExample(OptionHandlerFilter.REQUIRED));
            return;
        }

        // try-with-resources closes (and thereby flushes) every stream on both the success and
        // the failure path. The raw file stream is its own resource so that it is released even
        // if wrapping it in a GZIPInputStream fails; closing it twice is harmless.
        try (FileInputStream rawIn = new FileInputStream(args.input);
             InputStream fileIn = args.input.endsWith(".gz") ? new GZIPInputStream(rawIn) : rawIn;
             BufferedWriter dictWriter = openWriter(args.dict);
             BufferedWriter termsWriter = openWriter(args.terms);
             BufferedWriter docsWriter = openWriter(args.docs)) {

            CommonIndexFileFormat.Header header = CommonIndexFileFormat.Header.parseDelimitedFrom(fileIn);
            if (header == null) {
                // parseDelimitedFrom reports end-of-stream by returning null.
                throw new IllegalStateException("Input contains no header: " + args.input);
            }

            // The order matters: postings lists precede document records in the input stream.
            writePostings(fileIn, header.getNumPostingsLists(), dictWriter, termsWriter);
            writeDocs(fileIn, header.getNumDocs(), docsWriter);
        }
    }

    /** Opens {@code path} for writing as buffered UTF-8 text, truncating any existing file. */
    private static BufferedWriter openWriter(String path) throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8));
    }

    /**
     * Reads {@code numPostingsLists} postings lists from {@code in} and writes one dict line per
     * list and one terms line per posting.
     */
    private static void writePostings(InputStream in, long numPostingsLists,
                                      BufferedWriter dictWriter, BufferedWriter termsWriter)
            throws IOException {
        for (int termID = 0; termID < numPostingsLists; termID++) {
            CommonIndexFileFormat.PostingsList pl = CommonIndexFileFormat.PostingsList.parseDelimitedFrom(in);
            if (pl == null) {
                throw new IllegalStateException(String.format(
                        "Unexpected end of input: expected %d postings lists, got %d",
                        numPostingsLists, termID));
            }
            if (pl.getDf() != pl.getPostingsCount()) {
                throw new RuntimeException(String.format(
                        "Unexpected number of postings! expected %d got %d", pl.getDf(), pl.getPostingsCount()));
            }

            dictWriter.write(termID + FIELD_SEPARATOR + pl.getTerm() + FIELD_SEPARATOR + pl.getDf());
            dictWriter.newLine();

            // The stored docid values are accumulated into a running sum, i.e. each one is
            // treated as an offset from the previous posting of the same list.
            int docID = 0;
            int postingsCount = pl.getPostingsCount();
            for (int j = 0; j < postingsCount; j++) {
                docID += pl.getPostings(j).getDocid();
                termsWriter.write(termID + FIELD_SEPARATOR + docID + FIELD_SEPARATOR + pl.getPostings(j).getTf());
                termsWriter.newLine();
            }
        }
    }

    /** Reads {@code numDocs} document records from {@code in} and writes one docs line for each. */
    private static void writeDocs(InputStream in, long numDocs, BufferedWriter docsWriter)
            throws IOException {
        for (int i = 0; i < numDocs; i++) {
            CommonIndexFileFormat.DocRecord docRecord = CommonIndexFileFormat.DocRecord.parseDelimitedFrom(in);
            if (docRecord == null) {
                throw new IllegalStateException(String.format(
                        "Unexpected end of input: expected %d document records, got %d", numDocs, i));
            }
            docsWriter.write(docRecord.getCollectionDocid() + FIELD_SEPARATOR + docRecord.getDocid()
                    + FIELD_SEPARATOR + docRecord.getDoclength());
            docsWriter.newLine();
        }
    }
}