Skip to content

Commit

Permalink
Add support for sentences
Browse files Browse the repository at this point in the history
  • Loading branch information
mwhirls committed Jan 29, 2024
1 parent 696c51c commit 8fa5073
Show file tree
Hide file tree
Showing 2 changed files with 89 additions and 2 deletions.
19 changes: 18 additions & 1 deletion src/internal/segmenter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,18 @@ function nextWord(cursor: TokenCursor): IpadicNode {
}
}

/**
 * Surface forms that may terminate a sentence. The ASCII and the full-width
 * forms of the exclamation and question marks are both listed because
 * Japanese text commonly uses the full-width variants.
 */
const SENTENCE_END_PUNCTUATION: ReadonlySet<string> = new Set([
  '。',
  '!',
  '?',
  '…',
  '\uFF01', // full-width exclamation mark
  '\uFF1F', // full-width question mark
]);

/** Closing brackets that keep a preceding terminator inside the quotation. */
const SENTENCE_CLOSING_QUOTES: ReadonlySet<string> = new Set(['」', '】', '』']);

/**
 * Reports whether the token under `cursor` ends a sentence.
 *
 * A token ends a sentence when its surface form is terminal punctuation and
 * it is not immediately followed by a closing quote; in that case the
 * punctuation belongs to the quoted text and the sentence continues.
 *
 * @param cursor - Cursor positioned on the token to inspect.
 * @returns `true` if the current token is a sentence delimiter.
 */
function isSentenceDelimiter(cursor: TokenCursor): boolean {
  if (!SENTENCE_END_PUNCTUATION.has(cursor.token().surface_form)) {
    return false;
  }
  // A missing successor means this is the last token, so the terminator
  // always closes the sentence.
  const following = cursor.next();
  return !following || !SENTENCE_CLOSING_QUOTES.has(following.token().surface_form);
}

function nextSentence(tokens: kuromoji.IpadicFeatures[], start: number): IpadicSentence {
const result = [];
let index = start;
Expand All @@ -480,7 +492,12 @@ function nextSentence(tokens: kuromoji.IpadicFeatures[], start: number): IpadicS
const word = new IpadicWord(root);
index += word.tokens.length;
result.push(word);
// todo: break sentence
const last = index - 1 < tokens.length ? index - 1 : undefined;
if (last) {
if (isSentenceDelimiter(new TokenCursor(tokens, last))) {
return { words: result, start, end: index };
}
}
}
return { words: result, start, end: index };
}
Expand Down
72 changes: 71 additions & 1 deletion test/segmenter.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ describe('Segmenter', function () {
});
});

describe('PartOfSpeech', function () {
describe('segmentAsWords', function () {
// Using async in 'describe()' can cause the test not
// to print: https://github.com/mochajs/mocha/issues/2975
const context: TestContext = { segmenter: null };
Expand Down Expand Up @@ -79,4 +79,74 @@ describe('Segmenter', function () {
copula.runTestSuite(context);
conjunction.runTestSuite(context);
});

describe('segmentAsSentences', function () {
  // Shared fixture: the segmenter is built in the `before` hook and read by
  // every test below.
  const context: TestContext = { segmenter: null };
  before(async () => {
    context.segmenter = await bunsetsu.build(DICTIONARY_PATH);
  });

  // One entry per scenario: the suite title, the input text, the phrase used
  // in the test title, and the expected word count of each sentence in order.
  const cases = [
    {
      title: 'no punctuation',
      text: 'これはペンです',
      expectation: 'one sentence',
      wordCounts: [4],
    },
    {
      title: 'with full stop',
      text: 'これはペンです。あれもペンです。',
      expectation: 'two sentences',
      wordCounts: [5, 5],
    },
    {
      title: 'with question mark',
      text: 'これはペンですか?あれもペンです。',
      expectation: 'two sentences',
      wordCounts: [6, 5],
    },
    {
      title: 'with exclamation mark',
      text: 'これはペンですよ!あれもペンですよ!',
      expectation: 'two sentences',
      wordCounts: [6, 6],
    },
    {
      title: 'with tententen',
      text: 'これはペンです…あれもペンです…',
      expectation: 'two sentences',
      wordCounts: [5, 5],
    },
  ];

  for (const { title, text, expectation, wordCounts } of cases) {
    // The describe callback is deliberately synchronous: using async in
    // 'describe()' can cause the test not to print, see
    // https://github.com/mochajs/mocha/issues/2975
    describe(title, function () {
      it(`should identify ${text} as ${expectation}`, function () {
        assert.ok(context.segmenter);
        const sentences = context.segmenter.segmentAsSentences(text);
        assert.equal(sentences.length, wordCounts.length);
        for (const [i, expected] of wordCounts.entries()) {
          assert.equal(sentences[i].words.length, expected);
        }
      });
    });
  }
});
});

0 comments on commit 8fa5073

Please sign in to comment.