diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e89a6f4..dadaaf0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,7 @@ jobs: - name: Run Release run: | - java -jar AcrosticSleuth.jar -input data/demo -language EN -mode LINE -charset utf-8 -outputSize 4000 --concise > result.tsv + java -jar AcrosticSleuth.jar -input data/demo -language EN -charset utf-8 -outputSize 4000 --concise > result.tsv - name: Test Release on Mac or Ubuntu if: matrix.os != 'windows-latest' @@ -46,7 +46,7 @@ jobs: run: | javac -encoding UTF-8 -d out -cp src src/acrosticsleuth/*.java jar cfe AcrosticSleuth.jar acrosticsleuth.Main -C out . -C models . - java -jar AcrosticSleuth.jar -input data/demo -language EN -mode LINE -charset utf-8 -outputSize 4000 --concise > result.tsv + java -jar AcrosticSleuth.jar -input data/demo -language EN -charset utf-8 -outputSize 4000 --concise > result.tsv - name: Test Build on Mac or Ubuntu if: matrix.os != 'windows-latest' diff --git a/AcrosticSleuth.jar b/AcrosticSleuth.jar index 4f019a3..d37516d 100644 Binary files a/AcrosticSleuth.jar and b/AcrosticSleuth.jar differ diff --git a/README.md b/README.md index 4366563..b0f1ae3 100644 --- a/README.md +++ b/README.md @@ -27,13 +27,12 @@ This repository includes a demo dataset comprising a subset of pages with acrost You can test AcrosticSleuth on this small dataset using: ```bash -java -jar AcrosticSleuth.jar -input data/demo -language EN -mode LINE -charset utf-8 -outputSize 4000 --concise +java -jar AcrosticSleuth.jar -input data/demo -language EN -charset utf-8 -outputSize 4000 --concise ``` Here is the meaning behind each of the options used: - `-input data/demo`: analyze all texts in the `data/demo` directory - `-language EN`: use the default English language model -- `-mode LINE`: search for line acrostics (where an acrostic is formed by the initial letters of each line) - `-charset utf-8`: use the utf-8 encoding when opening the files - `-outputSize 4000`: return top 4000 instances (AcrosticSleuth clusters collocated instances, so the actual number of results it returns is much smaller -- 46) - `--concise`: only report key information (file,acrostic,rank). diff --git a/src/acrosticsleuth/CLO.java b/src/acrosticsleuth/CLO.java index 9831a46..dfa6328 100644 --- a/src/acrosticsleuth/CLO.java +++ b/src/acrosticsleuth/CLO.java @@ -12,40 +12,43 @@ import acrosticsleuth.CommandLine.Option; import acrosticsleuth.CommandLine.Command; -@Command(name = "CLO", mixinStandardHelpOptions = true, description = "Scout a corpus for acrostics") +@Command(name = "CLO", mixinStandardHelpOptions = true, description = "Search a corpus for acrostics") public class CLO implements Callable { - @Option(names = {"-input", "--input"}, required = true, description = "File or directory with all texts of interest") + @Option(names = {"-input", "--input"}, required = true, description = "Directory with input texts") private String input; // list of all texts to be processed public File[] texts; - @Option(names = {"-outputSize", "--outputSize"}, description = "Max number of potential acrostics to print") + @Option(names = {"-outputSize", "--outputSize"}, description = "Max number of results to print") public int outputSize = OUTPUT_SIZE_DEFAULT; public LanguageModel languageModel; public CharModel charModel; - @Option(names = {"-maxLength", "--maxLength"}, description = "Maximum length of an acrostic (in characters).") + @Option(names = {"-maxLength", "--maxLength"}, description = "Maximum allowed length of an acrostic", hidden = true) public int maxLength = MAX_LENGTH_DEFAULT; @Option(names = {"-workers", "--workers"}, description = "Number of threads to use") public int workers = WORKERS_DEFAULT; - @Option(names = {"-mode", "--mode"}, description = "Look for acrostics formed by the first letter of each LINE or WORD") + @Option(names = {"-mode", "--mode"}, description = "Look for acrostics formed by the first letters of each LINE or WORD", hidden = true) public Mode mode = MODE_DEFAULT; - @Option(names = {"-charset", "--charset"}, description = "Name of the character encoding to use. Supports utf-8 and windows-1251") + @Option(names = {"-charset", "--charset"}, description = "utf-8 or windows-1251") public Charset charset = CHARSET_DEFAULT; - @Option(names = {"-language", "--language"}, required = true, description = "Determines the language of the text: EN, LA, RU, FR") + @Option(names = {"-language", "--language"}, required = true, description = "EN, LA, RU, or FR") public Language language; - @Option(names = {"-concise", "--concise"}, description = "Report minimal information -- only the acrostic, the page it comes from, and the rank") + @Option(names = {"-concise", "--concise"}, description = "Report results concisely") public boolean concise; - @Option(names = {"-wikisource", "--wikisource"}, description = "Use if the input is a parsed WikiSource database, where there might be several texts per file.") + @Option(names = {"-wikisource", "--wikisource"}, description = "Use if running on WikiSource") public boolean wikisource; + @Option(names = {"-help", "--help"}, description = "Show this help message", usageHelp = true) + public boolean help; + public static final int MAX_LENGTH_DEFAULT = 50; public static final int OUTPUT_SIZE_DEFAULT = 10000; public static final int WORKERS_DEFAULT = 1;