470 lines
14 KiB
JavaScript
470 lines
14 KiB
JavaScript
|
/*!
 * lunr.Index
 * Copyright (C) @YEAR Oliver Nightingale
 */
|
||
|
|
||
|
/**
 * lunr.Index is an object that manages a search index. It holds the token
 * and document stores that make up the index and provides the main user
 * facing API for the library.
 *
 * A freshly constructed index has no fields, uses 'id' as the document ref
 * property and uses lunr.tokenizer as its tokenizer.
 *
 * @constructor
 */
lunr.Index = function () {
  var index = this

  index._fields = []
  index._ref = 'id'
  index.pipeline = new lunr.Pipeline()
  index.documentStore = new lunr.Store()
  index.tokenStore = new lunr.TokenStore()
  index.corpusTokens = new lunr.SortedSet()
  index.eventEmitter = new lunr.EventEmitter()
  index.tokenizerFn = lunr.tokenizer

  index._idfCache = {}

  // Cached idf values are derived from the contents of the index, so the
  // cache is discarded whenever a document is added, removed or updated.
  index.on('add', 'remove', 'update', function () {
    index._idfCache = {}
  })
}
|
||
|
|
||
|
/**
 * Bind a handler to events being emitted by the index.
 *
 * The handler can be bound to many events at the same time. All arguments
 * are forwarded, unchanged, to the `addListener` method of the index's
 * event emitter.
 *
 * @param {String} [eventName] The name(s) of events to bind the function to.
 * @param {Function} fn The handler function to bind.
 * @memberOf Index
 */
lunr.Index.prototype.on = function () {
  var emitter = this.eventEmitter
  return emitter.addListener.apply(emitter, Array.prototype.slice.call(arguments))
}
|
||
|
|
||
|
/**
 * Removes a handler from an event being emitted by the index.
 *
 * Delegates to the `removeListener` method of the index's event emitter and
 * returns whatever that call returns.
 *
 * @param {String} eventName The name of events to remove the function from.
 * @param {Function} fn The handler function to remove.
 * @memberOf Index
 */
lunr.Index.prototype.off = function (name, fn) {
  var emitter = this.eventEmitter
  return emitter.removeListener(name, fn)
}
|
||
|
|
||
|
/**
 * Loads a previously serialised index.
 *
 * Issues a warning if the index being imported was serialised
 * by a different version of lunr.
 *
 * The fields, ref, tokenizer, document store, token store, corpus tokens and
 * pipeline are all restored from the serialised data.
 *
 * @param {Object} serialisedData The serialised index to load.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.load = function (serialisedData) {
  if (serialisedData.version !== lunr.version) {
    lunr.utils.warn('version mismatch: current ' + lunr.version + ' importing ' + serialisedData.version)
  }

  // `new this` rather than `new lunr.Index` so the receiver of `load` decides
  // which constructor is used.
  var idx = new this

  idx._fields = serialisedData.fields
  idx._ref = serialisedData.ref

  // The tokenizer function lives in `tokenizerFn`; that is the property read
  // when adding documents, searching and serialising. Assigning the loaded
  // function to `idx.tokenizer` would instead shadow the `tokenizer` setter
  // method on this instance and leave the default tokenizer in use.
  idx.tokenizerFn = lunr.tokenizer.load(serialisedData.tokenizer)
  idx.documentStore = lunr.Store.load(serialisedData.documentStore)
  idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
  idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
  idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)

  return idx
}
|
||
|
|
||
|
/**
 * Adds a field to the list of fields that will be searchable within documents
 * in the index.
 *
 * An optional boost can be passed in the options object to affect how much
 * tokens in this field rank in search results. When no boost is given, or the
 * given boost is falsy, the boost value is 1.
 *
 * Fields should be added before any documents are added to the index, fields
 * that are added after documents are added to the index will only apply to new
 * documents added to the index.
 *
 * @param {String} fieldName The name of the field within the document that
 * should be indexed
 * @param {Object} [opts] Optional settings for the field.
 * @param {Number} [opts.boost] An optional boost that can be applied to terms
 * in this field.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.field = function (fieldName, opts) {
  var boost = (opts && opts.boost) || 1

  this._fields.push({ name: fieldName, boost: boost })

  return this
}
|
||
|
|
||
|
/**
 * Sets the property used to uniquely identify documents added to the index,
 * by default this property is 'id'.
 *
 * This should only be changed before adding documents to the index, changing
 * the ref property without resetting the index can lead to unexpected results.
 *
 * The value of ref can be of any type but it _must_ be stably comparable and
 * orderable.
 *
 * @param {String} refName The property to use to uniquely identify the
 * documents in the index.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.ref = function (refName) {
  var index = this

  index._ref = refName

  return index
}
|
||
|
|
||
|
/**
 * Sets the tokenizer used for this index.
 *
 * By default the index will use the default tokenizer, lunr.tokenizer. The tokenizer
 * should only be changed before adding documents to the index. Changing the tokenizer
 * without re-building the index can lead to unexpected results.
 *
 * A warning is issued when the function has no label, or its label is not
 * present in lunr.tokenizer.registeredFunctions; the function is still used.
 *
 * @param {Function} fn The function to use as a tokenizer.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.tokenizer = function (fn) {
  var label = fn.label,
      known = Boolean(label) && (label in lunr.tokenizer.registeredFunctions)

  // Serialisation records only the tokenizer's label, so warn about
  // functions that cannot be looked up by label.
  if (!known) {
    lunr.utils.warn('Function is not a registered tokenizer. This may cause problems when serialising the index')
  }

  this.tokenizerFn = fn

  return this
}
|
||
|
|
||
|
/**
 * Add a document to the index.
 *
 * This is the way new documents enter the index. Each configured field of the
 * document is tokenized and run through the index's pipeline, the resulting
 * tokens are recorded against the document's ref, and the document will then
 * show up in search results.
 *
 * For every distinct token in the document a term frequency is stored. It is
 * the sum, over all fields that produced tokens, of
 * (occurrences of the token in the field / number of tokens in the field)
 * multiplied by the field's boost.
 *
 * An 'add' event is emitted with the document that has been added and the index
 * the document has been added to. This event can be silenced by passing false
 * as the second argument to add.
 *
 * @param {Object} doc The document to add to the index.
 * @param {Boolean} emitEvent Whether or not to emit events, default true.
 * @memberOf Index
 */
lunr.Index.prototype.add = function (doc, emitEvent) {
  var self = this,
      ref = doc[this._ref],
      tokensByField = {},
      uniqueTokens = new lunr.SortedSet()

  if (emitEvent === undefined) emitEvent = true

  // First pass: tokenize every field, remembering the tokens per field and
  // collecting the distinct tokens for both the document and the corpus.
  this._fields.forEach(function (field) {
    var tokens = self.pipeline.run(self.tokenizerFn(doc[field.name]))

    tokensByField[field.name] = tokens

    for (var n = 0; n < tokens.length; n++) {
      uniqueTokens.add(tokens[n])
      self.corpusTokens.add(tokens[n])
    }
  })

  this.documentStore.set(ref, uniqueTokens)

  // Second pass: compute the boosted term frequency of each distinct token
  // and record it in the token store.
  for (var t = 0; t < uniqueTokens.length; t++) {
    var token = uniqueTokens.elements[t],
        tf = 0

    for (var f = 0; f < this._fields.length; f++) {
      var field = this._fields[f],
          tokens = tokensByField[field.name],
          total = tokens.length

      // A field that produced no tokens contributes nothing (and would
      // otherwise cause a division by zero).
      if (total) {
        var occurrences = 0

        for (var k = 0; k < total; k++) {
          if (tokens[k] === token) occurrences += 1
        }

        tf += (occurrences / total * field.boost)
      }
    }

    this.tokenStore.add(token, { ref: ref, tf: tf })
  }

  if (emitEvent) this.eventEmitter.emit('add', doc, this)
}
|
||
|
|
||
|
/**
 * Removes a document from the index.
 *
 * To make sure documents no longer show up in search results they can be
 * removed from the index using this method.
 *
 * The document passed only needs to have the same ref property value as the
 * document that was added to the index, they could be completely different
 * objects. If the document store has no entry for that ref nothing happens
 * and no event is emitted.
 *
 * A 'remove' event is emitted with the document that has been removed and the index
 * the document has been removed from. This event can be silenced by passing false
 * as the second argument to remove.
 *
 * @param {Object} doc The document to remove from the index.
 * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
 * @memberOf Index
 */
lunr.Index.prototype.remove = function (doc, emitEvent) {
  var self = this,
      ref = doc[this._ref]

  if (emitEvent === undefined) emitEvent = true

  // Nothing to do for a ref the document store does not know about.
  if (!this.documentStore.has(ref)) return

  // Grab the document's tokens before its entry is dropped; they are needed
  // to clean the token store afterwards.
  var tokens = this.documentStore.get(ref)

  this.documentStore.remove(ref)

  tokens.forEach(function (token) {
    self.tokenStore.remove(token, ref)
  })

  if (emitEvent) this.eventEmitter.emit('remove', doc, this)
}
|
||
|
|
||
|
/**
 * Updates a document in the index.
 *
 * When a document contained within the index gets updated, fields changed,
 * added or removed, it should be updated in the index to make sure it is
 * correctly matched against search queries.
 *
 * This method is just a wrapper around `remove` and `add`
 *
 * An 'update' event is emitted with the document that has been updated and the index.
 * This event can be silenced by passing false as the second argument to update. Only
 * an update event will be fired, the 'add' and 'remove' events of the underlying calls
 * are silenced.
 *
 * @param {Object} doc The document to update in the index.
 * @param {Boolean} emitEvent Whether to emit update events, defaults to true
 * @see Index.prototype.remove
 * @see Index.prototype.add
 * @memberOf Index
 */
lunr.Index.prototype.update = function (doc, emitEvent) {
  if (emitEvent === undefined) emitEvent = true

  // Both underlying calls are made silently so that listeners only ever see
  // the single 'update' event below.
  this.remove(doc, false)
  this.add(doc, false)

  if (!emitEvent) return

  this.eventEmitter.emit('update', doc, this)
}
|
||
|
|
||
|
/**
 * Calculates the inverse document frequency for a term within the index.
 *
 * The result is 1 when the token store reports no documents for the term,
 * otherwise 1 + ln(number of documents / documents containing the term).
 * Results are memoised in the index's idf cache.
 *
 * @param {String} term The term to calculate the idf of.
 * @returns {Number}
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.idf = function (term) {
  // NOTE(review): cache keys are prefixed with "@", presumably so that terms
  // such as "__proto__" cannot collide with special property names - confirm.
  var cache = this._idfCache,
      cacheKey = "@" + term

  if (Object.prototype.hasOwnProperty.call(cache, cacheKey)) {
    return cache[cacheKey]
  }

  var documentFrequency = this.tokenStore.count(term),
      idf = documentFrequency > 0
        ? 1 + Math.log(this.documentStore.length / documentFrequency)
        : 1

  cache[cacheKey] = idf

  return idf
}
|
||
|
|
||
|
/**
 * Searches the index using the passed query.
 *
 * Queries should be a string, multiple words are allowed and will lead to an
 * AND based query, e.g. `idx.search('foo bar')` will run a search for
 * documents containing both 'foo' and 'bar'.
 *
 * All query tokens are passed through the same tokenizer and pipeline that
 * document tokens are passed through, so any language processing involved
 * will be run on every query term.
 *
 * Each query term is expanded, so that the term 'he' might be expanded to
 * 'hello' and 'help' if those terms were already included in the index.
 * Expanded terms that differ from the query term are scored lower than exact
 * matches.
 *
 * Matching documents are returned as an array of objects sorted by descending
 * score, each object contains the matching document ref, as set for this
 * index, and the similarity score for this document against the query. An
 * empty array is returned when none of the query tokens are in the token
 * store.
 *
 * @param {String} query The query to search the index with.
 * @returns {Object}
 * @see Index.prototype.idf
 * @see Index.prototype.documentVector
 * @memberOf Index
 */
lunr.Index.prototype.search = function (query) {
  var self = this,
      queryTokens = this.pipeline.run(this.tokenizerFn(query)),
      queryVector = new lunr.Vector(),
      documentSets = [],
      fieldBoosts = 0

  for (var b = 0; b < this._fields.length; b++) {
    fieldBoosts = fieldBoosts + this._fields[b].boost
  }

  var anyTokenKnown = queryTokens.some(function (token) {
    return self.tokenStore.has(token)
  })

  if (!anyTokenKnown) return []

  // Every query token gets the same term frequency.
  var queryTf = 1 / queryTokens.length * this._fields.length * fieldBoosts

  queryTokens.forEach(function (token) {
    var expansions = self.tokenStore.expand(token),
        matches = new lunr.SortedSet()

    for (var e = 0; e < expansions.length; e++) {
      var key = expansions[e],
          pos = self.corpusTokens.indexOf(key),
          idf = self.idf(key),
          similarityBoost = 1,
          keyDocuments = new lunr.SortedSet()

      // An expanded key that is not an exact match for the token is
      // penalised by how much longer it is than the token. The difference is
      // floored at 3 so the logarithm is always greater than 1 and the
      // resulting boost is always less than that of an exact match.
      if (key !== token) {
        similarityBoost = 1 / Math.log(Math.max(3, key.length - token.length))
      }

      // Record the query tf-idf score for this key, scaled by the similarity
      // boost so that exact matches rank higher than expanded terms.
      if (pos > -1) queryVector.insert(pos, queryTf * idf * similarityBoost)

      // Collect every document containing this key. The stored `ref` value is
      // used rather than the object key so the type of the ref is preserved.
      var postings = self.tokenStore.get(key),
          postingKeys = Object.keys(postings)

      for (var p = 0; p < postingKeys.length; p++) {
        keyDocuments.add(postings[postingKeys[p]].ref)
      }

      matches = matches.union(keyDocuments)
    }

    documentSets.push(matches)
  })

  // AND semantics: only documents matched by every query token survive.
  var documentSet = documentSets[0]

  for (var s = 1; s < documentSets.length; s++) {
    documentSet = documentSet.intersect(documentSets[s])
  }

  var results = documentSet.map(function (ref) {
    return { ref: ref, score: queryVector.similarity(self.documentVector(ref)) }
  })

  return results.sort(function (a, b) {
    return b.score - a.score
  })
}
|
||
|
|
||
|
/**
 * Generates a vector for the document matching the passed documentRef.
 *
 * For each token stored against the document, the token's tf-idf score (the
 * stored term frequency multiplied by the token's idf) is inserted into the
 * vector at the token's position within the index's corpus tokens.
 *
 * @param {Object} documentRef The ref to find the document with.
 * @returns {lunr.Vector}
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.documentVector = function (documentRef) {
  var tokens = this.documentStore.get(documentRef),
      tokenCount = tokens.length,
      vector = new lunr.Vector()

  for (var n = 0; n < tokenCount; n++) {
    var token = tokens.elements[n],
        posting = this.tokenStore.get(token)[documentRef],
        weight = posting.tf * this.idf(token)

    vector.insert(this.corpusTokens.indexOf(token), weight)
  }

  return vector
}
|
||
|
|
||
|
/**
 * Returns a representation of the index ready for serialisation.
 *
 * The tokenizer is represented by its label only; the stores, corpus tokens
 * and pipeline are represented by the result of their own `toJSON` methods.
 *
 * @returns {Object}
 * @memberOf Index
 */
lunr.Index.prototype.toJSON = function () {
  var serialised = {}

  serialised.version = lunr.version
  serialised.fields = this._fields
  serialised.ref = this._ref
  serialised.tokenizer = this.tokenizerFn.label
  serialised.documentStore = this.documentStore.toJSON()
  serialised.tokenStore = this.tokenStore.toJSON()
  serialised.corpusTokens = this.corpusTokens.toJSON()
  serialised.pipeline = this.pipeline.toJSON()

  return serialised
}
|
||
|
|
||
|
/**
 * Applies a plugin to the current index.
 *
 * A plugin is just a function that encapsulates some custom behaviour that
 * should be applied to the index, allowing the index to be customised or
 * extended in some way.
 *
 * The plugin function is called with the index both as its context and as
 * its first argument. Any additional arguments passed to use are passed on to
 * the plugin after the index.
 *
 * Example:
 *
 *     var myPlugin = function (idx, arg1, arg2) {
 *       // `this` is the index to be extended
 *       // apply any extensions etc here.
 *     }
 *
 *     var idx = lunr(function () {
 *       this.use(myPlugin, 'arg1', 'arg2')
 *     })
 *
 * @param {Function} plugin The plugin to apply.
 * @memberOf Index
 */
lunr.Index.prototype.use = function (plugin) {
  var extraArgs = Array.prototype.slice.call(arguments, 1)

  plugin.apply(this, [this].concat(extraArgs))
}
|