Create an initial pluggable tokenizer with ngram support so that lunr can drive autocomplete-style search boxes #63

Open
wants to merge 3 commits into base: master
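With these changes an index can be configured for autocomplete-style matching by swapping in an ngram tokenizer, relaxing the all-terms requirement, and clearing the default pipeline. A minimal sketch following the doc-comment example added to lib/lunr.js below; the fields and document here are illustrative, not part of the diff:

var idx = lunr(function () {
  this.field('name', 10)
  this.field('city')
  this.ref('id')

  // set up ngram matching for autocomplete-style search
  this.tokenizer = lunr.trigramtokenizer
  this.requireAllTerms = false
  this.pipeline.clear() // ngram tokens should not be stemmed or stop-word filtered
})

idx.add({ id: 1, name: 'Will Ballard', city: 'Austin' })
idx.search('wil') // partial input still matches, and shared prefixes score highest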
5 changes: 3 additions & 2 deletions Makefile
@@ -2,8 +2,9 @@
 SRC = lib/lunr.js \
       lib/utils.js \
       lib/event_emitter.js \
+      lib/tokenizer.js \
       lib/pipeline.js \
-      lib/tokenizer.js \
+      lib/ngramtokenizer.js \
       lib/vector.js \
       lib/sorted_set.js \
       lib/index.js \
@@ -35,7 +36,7 @@ test_server:
 	node server.js 3000
 
 test:
-	phantomjs test/env/runner.js http://localhost:3000/test
+	PATH="./node_modules/.bin:${PATH}" phantomjs test/env/runner.js http://localhost:3000/test
 
 docs:
 	dox < lunr.js | dox-template -n lunr.js -r ${VERSION} > docs/index.html
281 changes: 198 additions & 83 deletions docs/index.html

Large diffs are not rendered by default.

17 changes: 12 additions & 5 deletions lib/index.js
@@ -18,6 +18,8 @@ lunr.Index = function () {
   this.tokenStore = new lunr.TokenStore
   this.corpusTokens = new lunr.SortedSet
   this.eventEmitter = new lunr.EventEmitter
+  this.tokenizer = lunr.tokenizer
+  this.requireAllTerms = true
 
   this._idfCache = {}
 
@@ -75,6 +77,7 @@ lunr.Index.load = function (serialisedData) {
   idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
   idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
   idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)
+  idx.tokenizer = lunr.Pipeline.registeredFunctions[serialisedData.tokenizer] || lunr.tokenizer
 
   return idx
 }
@@ -143,9 +146,10 @@ lunr.Index.prototype.add = function (doc, emitEvent) {
       allDocumentTokens = new lunr.SortedSet,
       docRef = doc[this._ref],
-      emitEvent = emitEvent === undefined ? true : emitEvent
+      emitEvent = emitEvent === undefined ? true : emitEvent,
+      self = this
 
   this._fields.forEach(function (field) {
-    var fieldTokens = this.pipeline.run(lunr.tokenizer(doc[field.name]))
+    var fieldTokens = this.pipeline.run(self.tokenizer(doc[field.name]))
 
     docTokens[field.name] = fieldTokens
     lunr.SortedSet.prototype.add.apply(allDocumentTokens, fieldTokens)
@@ -282,10 +286,11 @@ lunr.Index.prototype.idf = function (term) {
  * @memberOf Index
  */
 lunr.Index.prototype.search = function (query) {
-  var queryTokens = this.pipeline.run(lunr.tokenizer(query)),
+  var queryTokens = this.pipeline.run(this.tokenizer(query)),
       queryArr = lunr.utils.zeroFillArray(this.corpusTokens.length),
       documentSets = [],
-      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)
+      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0),
+      self = this
 
   var hasSomeToken = queryTokens.some(function (token) {
     return this.tokenStore.has(token)
@@ -327,7 +332,7 @@ }, this)
   }, this)
 
   var documentSet = documentSets.reduce(function (memo, set) {
-    return memo.intersect(set)
+    return self.requireAllTerms ? memo.intersect(set) : memo.union(set)
   })
 
   var queryVector = new lunr.Vector (queryArr)
@@ -385,6 +390,8 @@ lunr.Index.prototype.toJSON = function () {
     documentStore: this.documentStore.toJSON(),
     tokenStore: this.tokenStore.toJSON(),
     corpusTokens: this.corpusTokens.toJSON(),
-    pipeline: this.pipeline.toJSON()
+    pipeline: this.pipeline.toJSON(),
+    tokenizer: lunr.Pipeline.warnIfFunctionNotRegistered(this.tokenizer),
+    requireAllTerms: this.requireAllTerms
   }
 }
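
Because toJSON stores the tokenizer by its registered label and Index.load looks that label up in lunr.Pipeline.registeredFunctions, a serialised index keeps a custom tokenizer across a JSON round trip, falling back to the default lunr.tokenizer when the label is unknown. A sketch, assuming the idx configured above:

var restored = lunr.Index.load(JSON.parse(JSON.stringify(idx.toJSON())))
// restored.tokenizer === lunr.trigramtokenizer, recovered via its registered label.
// Note: this diff restores tokenizer but not requireAllTerms, so re-apply that
// flag by hand after loading if you rely on union-style matching:
restored.requireAllTerms = false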
11 changes: 8 additions & 3 deletions lib/lunr.js
@@ -23,13 +23,18 @@
  *      this.field('title', 10)
  *      this.field('tags', 100)
  *      this.field('body')
  *
  *      this.ref('cid')
  *
  *      this.pipeline.add(function () {
  *        // some custom pipeline function
  *      })
  *
+ *      // set up ngram matching for autocomplete-style search
+ *      this.tokenizer = lunr.trigramtokenizer
+ *      this.requireAllTerms = false
+ *      this.pipeline.clear()
+ *
  *    })
  *
  * @param {Function} config A function that will be called with the new instance
58 changes: 58 additions & 0 deletions lib/ngramtokenizer.js
@@ -0,0 +1,58 @@
/*!
* lunr.ngramtokenizer
* Copyright (C) @YEAR Will Ballard
*/

/**
 * A factory that returns a tokenizer function, splitting a string into
 * ngram tokens suitable for short-string autocomplete indexing and fuzzy
 * name matching.
*
* In order to effectively boost exact matches, a start character \u0002
* and an end character \u0003 are wrapped around the string and used
* in the ngrams. This causes a sequence of characters at the start of
* both a search query and a sought term to more tightly match than a similar
* series of characters elsewhere in sought terms.
*
* @module
* @param {Number} len Make character ngrams of this length
* @returns {Function}
*/
lunr.ngramtokenizer = function (len) {
return function(obj) {
if (!arguments.length || obj == null || obj == undefined) return []
if (Array.isArray(obj)) return obj.map(function (t) { return t.toLowerCase() })

var str = "\u0002" + obj.toString() + '\u0003';

if (str.length <= len) {
return [str.toLowerCase()];
} else {
var buffer = [];
for (var i = 0; i <= str.length - len; i++) {
buffer.push(str.slice(i, i + len).toLowerCase());
}
return buffer;
}
}
}

/**
* A tokenizer that indexes on character bigrams.
*
* @module
* @param {String} obj The string to convert into tokens
 * @returns {Array} The bigram tokens
*/
lunr.bigramtokenizer = lunr.ngramtokenizer(2)
lunr.Pipeline.registerFunction(lunr.bigramtokenizer, 'bigramtokenizer')

/**
* A tokenizer that indexes on character trigrams.
*
* @module
* @param {String} obj The string to convert into tokens
 * @returns {Array} The trigram tokens
*/
lunr.trigramtokenizer = lunr.ngramtokenizer(3)
lunr.Pipeline.registerFunction(lunr.trigramtokenizer, 'trigramtokenizer')
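
For illustration, the wrapping characters give a query that shares a prefix with an indexed term a token in common that mid-word overlaps never produce; a sketch of the actual output (\u0002 and \u0003 shown escaped):

lunr.trigramtokenizer('Foo') // => ['\u0002fo', 'foo', 'oo\u0003']
lunr.trigramtokenizer('fo')  // => ['\u0002fo', 'fo\u0003']; shares '\u0002fo' with 'Foo',
                             //    so the prefix match is rewarded
lunr.trigramtokenizer('a')   // => ['\u0002a\u0003'] (short strings yield one wrapped token)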
16 changes: 14 additions & 2 deletions lib/pipeline.js
@@ -64,14 +64,17 @@ lunr.Pipeline.registerFunction = function (fn, label) {
  * Warns if the function is not registered as a Pipeline function.
  *
  * @param {Function} fn The function to check for.
- * @private
+ * @returns {String} The registered string label of the function, or undefined.
  * @memberOf Pipeline
  */
 lunr.Pipeline.warnIfFunctionNotRegistered = function (fn) {
   var isRegistered = fn.label && (fn.label in this.registeredFunctions)
 
   if (!isRegistered) {
-    lunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn)
+    lunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn);
+    return undefined;
+  } else {
+    return fn.label;
   }
 }

@@ -206,3 +209,12 @@ lunr.Pipeline.prototype.toJSON = function () {
     return fn.label
   })
 }
+
+/**
+ * Clears out a pipeline, removing all functions.
+ *
+ * @memberOf Pipeline
+ */
+lunr.Pipeline.prototype.clear = function () {
+  this._stack = []
+}
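
Since Index.prototype.toJSON now serialises the tokenizer through warnIfFunctionNotRegistered, a custom tokenizer should be registered just as the built-in ones are. A sketch using a hypothetical 4-gram tokenizer, not part of this diff:

var quadgramtokenizer = lunr.ngramtokenizer(4)
lunr.Pipeline.registerFunction(quadgramtokenizer, 'quadgramtokenizer')
lunr.Pipeline.warnIfFunctionNotRegistered(quadgramtokenizer) // => 'quadgramtokenizer', no warning emitted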
2 changes: 2 additions & 0 deletions lib/tokenizer.js
@@ -30,3 +30,5 @@ lunr.tokenizer = function (obj) {
     return token.replace(/^\W+/, '').replace(/\W+$/, '').toLowerCase()
   })
 }
+
+lunr.Pipeline.registerFunction(lunr.tokenizer, 'tokenizer')