Create an initial pluggable tokenizer with ngram support so that lunr can drive autocomplete-style search boxes #63

Open
wants to merge 3 commits into base: master
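With these changes an index can be configured for autocomplete-style matching by swapping in an ngram tokenizer, relaxing the all-terms requirement, and clearing the default pipeline. A minimal sketch following the doc-comment example added to lib/lunr.js below; the fields and document here are illustrative, not part of the diff:

var idx = lunr(function () {
  this.field('name', 10)
  this.field('city')
  this.ref('id')

  // set up ngram matching for autocomplete-style search
  this.tokenizer = lunr.trigramtokenizer
  this.requireAllTerms = false
  this.pipeline.clear() // ngram tokens should not be stemmed or stop-word filtered
})

idx.add({ id: 1, name: 'Will Ballard', city: 'Austin' })
idx.search('wil') // partial input still matches, and shared prefixes score highest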
5 changes: 3 additions & 2 deletions Makefile
@@ -2,8 +2,9 @@
 SRC = lib/lunr.js \
       lib/utils.js \
       lib/event_emitter.js \
+      lib/tokenizer.js \
       lib/pipeline.js \
-      lib/tokenizer.js \
+      lib/ngramtokenizer.js \
       lib/vector.js \
       lib/sorted_set.js \
       lib/index.js \
@@ -35,7 +36,7 @@ test_server:
 	node server.js 3000
 
 test:
-	phantomjs test/env/runner.js http://localhost:3000/test
+	PATH="./node_modules/.bin:${PATH}" phantomjs test/env/runner.js http://localhost:3000/test
 
 docs:
 	dox < lunr.js | dox-template -n lunr.js -r ${VERSION} > docs/index.html
281 changes: 198 additions & 83 deletions docs/index.html

Large diffs are not rendered by default.

17 changes: 12 additions & 5 deletions lib/index.js
@@ -18,6 +18,8 @@ lunr.Index = function () {
   this.tokenStore = new lunr.TokenStore
   this.corpusTokens = new lunr.SortedSet
   this.eventEmitter = new lunr.EventEmitter
+  this.tokenizer = lunr.tokenizer
+  this.requireAllTerms = true
 
   this._idfCache = {}
 
@@ -75,6 +77,7 @@ lunr.Index.load = function (serialisedData) {
   idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
   idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
   idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)
+  idx.tokenizer = lunr.Pipeline.registeredFunctions[serialisedData.tokenizer] || lunr.tokenizer
 
   return idx
 }
@@ -143,9 +146,10 @@ lunr.Index.prototype.add = function (doc, emitEvent) {
       allDocumentTokens = new lunr.SortedSet,
       docRef = doc[this._ref],
-      emitEvent = emitEvent === undefined ? true : emitEvent
+      emitEvent = emitEvent === undefined ? true : emitEvent,
+      self = this
 
   this._fields.forEach(function (field) {
-    var fieldTokens = this.pipeline.run(lunr.tokenizer(doc[field.name]))
+    var fieldTokens = this.pipeline.run(self.tokenizer(doc[field.name]))
 
     docTokens[field.name] = fieldTokens
     lunr.SortedSet.prototype.add.apply(allDocumentTokens, fieldTokens)
@@ -282,10 +286,11 @@ lunr.Index.prototype.idf = function (term) {
  * @memberOf Index
  */
 lunr.Index.prototype.search = function (query) {
-  var queryTokens = this.pipeline.run(lunr.tokenizer(query)),
+  var queryTokens = this.pipeline.run(this.tokenizer(query)),
       queryArr = lunr.utils.zeroFillArray(this.corpusTokens.length),
       documentSets = [],
-      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)
+      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0),
+      self = this
 
   var hasSomeToken = queryTokens.some(function (token) {
     return this.tokenStore.has(token)
@@ -327,7 +332,7 @@ }, this)
   }, this)
 
   var documentSet = documentSets.reduce(function (memo, set) {
-    return memo.intersect(set)
+    return self.requireAllTerms ? memo.intersect(set) : memo.union(set)
   })
 
   var queryVector = new lunr.Vector (queryArr)
@@ -385,6 +390,8 @@ lunr.Index.prototype.toJSON = function () {
     documentStore: this.documentStore.toJSON(),
     tokenStore: this.tokenStore.toJSON(),
     corpusTokens: this.corpusTokens.toJSON(),
-    pipeline: this.pipeline.toJSON()
+    pipeline: this.pipeline.toJSON(),
+    tokenizer: lunr.Pipeline.warnIfFunctionNotRegistered(this.tokenizer),
+    requireAllTerms: this.requireAllTerms
   }
 }
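
Because toJSON stores the tokenizer by its registered label and Index.load looks that label up in lunr.Pipeline.registeredFunctions, a serialised index keeps a custom tokenizer across a JSON round trip, falling back to the default lunr.tokenizer when the label is unknown. A sketch, assuming the idx configured above:

var restored = lunr.Index.load(JSON.parse(JSON.stringify(idx.toJSON())))
// restored.tokenizer === lunr.trigramtokenizer, recovered via its registered label.
// Note: this diff restores tokenizer but not requireAllTerms, so re-apply that
// flag by hand after loading if you rely on union-style matching:
restored.requireAllTerms = false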
11 changes: 8 additions & 3 deletions lib/lunr.js
@@ -23,13 +23,18 @@
  *      this.field('title', 10)
  *      this.field('tags', 100)
  *      this.field('body')
  *
  *      this.ref('cid')
  *
  *      this.pipeline.add(function () {
  *        // some custom pipeline function
  *      })
  *
+ *      // set up ngram matching for autocomplete-style search
+ *      this.tokenizer = lunr.trigramtokenizer
+ *      this.requireAllTerms = false
+ *      this.pipeline.clear()
+ *
  *    })
  *
  * @param {Function} config A function that will be called with the new instance
58 changes: 58 additions & 0 deletions lib/ngramtokenizer.js
@@ -0,0 +1,58 @@
/*!
* lunr.ngramtokenizer
* Copyright (C) @YEAR Will Ballard
*/

/**
 * A factory that returns a tokenizer function, splitting a string into
 * ngram tokens suitable for short-string autocomplete indexing and fuzzy
 * name matching.
*
* In order to effectively boost exact matches, a start character \u0002
* and an end character \u0003 are wrapped around the string and used
* in the ngrams. This causes a sequence of characters at the start of
* both a search query and a sought term to more tightly match than a similar
* series of characters elsewhere in sought terms.
*
* @module
* @param {Number} len Make character ngrams of this length
* @returns {Function}
*/
lunr.ngramtokenizer = function (len) {
return function(obj) {
if (!arguments.length || obj == null || obj == undefined) return []
if (Array.isArray(obj)) return obj.map(function (t) { return t.toLowerCase() })

var str = "\u0002" + obj.toString() + '\u0003';

if (str.length <= len) {
return [str.toLowerCase()];
} else {
var buffer = [];
for (var i = 0; i <= str.length - len; i++) {
buffer.push(str.slice(i, i + len).toLowerCase());
}
return buffer;
}
}
}

/**
* A tokenizer that indexes on character bigrams.
*
* @module
* @param {String} obj The string to convert into tokens
 * @returns {Array} The bigram tokens
*/
lunr.bigramtokenizer = lunr.ngramtokenizer(2)
lunr.Pipeline.registerFunction(lunr.bigramtokenizer, 'bigramtokenizer')

/**
* A tokenizer that indexes on character trigrams.
*
* @module
* @param {String} obj The string to convert into tokens
 * @returns {Array} The trigram tokens
*/
lunr.trigramtokenizer = lunr.ngramtokenizer(3)
lunr.Pipeline.registerFunction(lunr.trigramtokenizer, 'trigramtokenizer')
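
For illustration, the wrapping characters give a query that shares a prefix with an indexed term a token in common that mid-word overlaps never produce; a sketch of the actual output (\u0002 and \u0003 shown escaped):

lunr.trigramtokenizer('Foo') // => ['\u0002fo', 'foo', 'oo\u0003']
lunr.trigramtokenizer('fo')  // => ['\u0002fo', 'fo\u0003']; shares '\u0002fo' with 'Foo',
                             //    so the prefix match is rewarded
lunr.trigramtokenizer('a')   // => ['\u0002a\u0003'] (short strings yield one wrapped token)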
16 changes: 14 additions & 2 deletions lib/pipeline.js
@@ -64,14 +64,17 @@ lunr.Pipeline.registerFunction = function (fn, label) {
  * Warns if the function is not registered as a Pipeline function.
  *
  * @param {Function} fn The function to check for.
- * @private
+ * @returns {String} The registered string label of the function, or undefined.
  * @memberOf Pipeline
  */
 lunr.Pipeline.warnIfFunctionNotRegistered = function (fn) {
   var isRegistered = fn.label && (fn.label in this.registeredFunctions)
 
   if (!isRegistered) {
-    lunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn)
+    lunr.utils.warn('Function is not registered with pipeline. This may cause problems when serialising the index.\n', fn);
+    return undefined;
+  } else {
+    return fn.label;
   }
 }

@@ -206,3 +209,12 @@ lunr.Pipeline.prototype.toJSON = function () {
     return fn.label
   })
 }
+
+/**
+ * Clears out a pipeline, removing all functions.
+ *
+ * @memberOf Pipeline
+ */
+lunr.Pipeline.prototype.clear = function () {
+  this._stack = []
+}
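
Since Index.prototype.toJSON now serialises the tokenizer through warnIfFunctionNotRegistered, a custom tokenizer should be registered just as the built-in ones are. A sketch using a hypothetical 4-gram tokenizer, not part of this diff:

var quadgramtokenizer = lunr.ngramtokenizer(4)
lunr.Pipeline.registerFunction(quadgramtokenizer, 'quadgramtokenizer')
lunr.Pipeline.warnIfFunctionNotRegistered(quadgramtokenizer) // => 'quadgramtokenizer', no warning emitted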
2 changes: 2 additions & 0 deletions lib/tokenizer.js
@@ -30,3 +30,5 @@ lunr.tokenizer = function (obj) {
     return token.replace(/^\W+/, '').replace(/\W+$/, '').toLowerCase()
   })
 }
+
+lunr.Pipeline.registerFunction(lunr.tokenizer, 'tokenizer')