/*!
 * lunr.Index
 * Copyright (C) @YEAR Oliver Nightingale
 */

/**
 * lunr.Index is object that manages a search index. It contains the indexes
 * and stores all the tokens and document lookups. It also provides the main
 * user facing API for the library.
 *
 * @constructor
 */
lunr.Index = function () {
  this._fields = []
  this._ref = 'id'
  this.pipeline = new lunr.Pipeline
  this.documentStore = new lunr.Store
  this.tokenStore = new lunr.TokenStore
  this.corpusTokens = new lunr.SortedSet
  this.eventEmitter = new lunr.EventEmitter
  this.tokenizerFn = lunr.tokenizer

  this._idfCache = {}

  // Any change to the indexed documents invalidates the cached idf values.
  this.on('add', 'remove', 'update', (function () {
    this._idfCache = {}
  }).bind(this))
}

/**
 * Bind a handler to events being emitted by the index.
 *
 * The handler can be bound to many events at the same time.
 *
 * @param {String} [eventName] The name(s) of events to bind the function to.
 * @param {Function} fn The handler to call when the event is emitted.
 * @memberOf Index
 */
lunr.Index.prototype.on = function () {
  var args = Array.prototype.slice.call(arguments)
  return this.eventEmitter.addListener.apply(this.eventEmitter, args)
}

/**
 * Removes a handler from an event being emitted by the index.
 *
 * @param {String} eventName The name of events to remove the function from.
 * @param {Function} fn The handler to remove.
 * @memberOf Index
 */
lunr.Index.prototype.off = function (name, fn) {
  return this.eventEmitter.removeListener(name, fn)
}

/**
 * Loads a previously serialised index.
 *
 * Issues a warning if the index being imported was serialised
 * by a different version of lunr.
 *
 * @param {Object} serialisedData The serialised index to load.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.load = function (serialisedData) {
  if (serialisedData.version !== lunr.version) {
    lunr.utils.warn('version mismatch: current ' + lunr.version + ' importing ' + serialisedData.version)
  }

  var idx = new this

  idx._fields = serialisedData.fields
  idx._ref = serialisedData.ref

  // Restore the tokenizer via tokenizerFn: `add` and `search` read
  // this.tokenizerFn, and `tokenizer` is the prototype *setter* method —
  // assigning the loaded function to idx.tokenizer would shadow that method
  // and leave the index tokenizing with the default tokenizer.
  idx.tokenizerFn = lunr.tokenizer.load(serialisedData.tokenizer)

  idx.documentStore = lunr.Store.load(serialisedData.documentStore)
  idx.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
  idx.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
  idx.pipeline = lunr.Pipeline.load(serialisedData.pipeline)

  return idx
}

/**
 * Adds a field to the list of fields that will be searchable within documents
 * in the index.
 *
 * An optional boost param can be passed to affect how much tokens in this field
 * rank in search results, by default the boost value is 1.
 *
 * Fields should be added before any documents are added to the index, fields
 * that are added after documents are added to the index will only apply to new
 * documents added to the index.
 *
 * @param {String} fieldName The name of the field within the document that
 * should be indexed
 * @param {Object} [opts] Options object; `opts.boost` is an optional boost
 * that can be applied to terms in this field.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.field = function (fieldName, opts) {
  opts = opts || {}

  var field = { name: fieldName, boost: opts.boost || 1 }

  this._fields.push(field)
  return this
}

/**
 * Sets the property used to uniquely identify documents added to the index,
 * by default this property is 'id'.
 *
 * This should only be changed before adding documents to the index, changing
 * the ref property without resetting the index can lead to unexpected results.
 *
 * The value of ref can be of any type but it _must_ be stably comparable and
 * orderable.
 *
 * @param {String} refName The property to use to uniquely identify the
 * documents in the index.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.ref = function (refName) {
  this._ref = refName
  return this
}

/**
 * Sets the tokenizer used for this index.
 *
 * By default the index will use the default tokenizer, lunr.tokenizer. The tokenizer
 * should only be changed before adding documents to the index. Changing the tokenizer
 * without re-building the index can lead to unexpected results.
 *
 * @param {Function} fn The function to use as a tokenizer.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.prototype.tokenizer = function (fn) {
  var isRegistered = fn.label && (fn.label in lunr.tokenizer.registeredFunctions)

  if (!isRegistered) {
    // An unregistered tokenizer cannot be looked up again by label when the
    // index is deserialised with lunr.Index.load.
    lunr.utils.warn('Function is not a registered tokenizer. This may cause problems when serialising the index')
  }

  this.tokenizerFn = fn
  return this
}

/**
 * Add a document to the index.
 *
 * This is the way new documents enter the index, this function will run the
 * fields from the document through the index's pipeline and then add it to
 * the index, it will then show up in search results.
 *
 * An 'add' event is emitted with the document that has been added and the index
 * the document has been added to. This event can be silenced by passing false
 * as the second argument to add.
 *
 * @param {Object} doc The document to add to the index.
 * @param {Boolean} emitEvent Whether or not to emit events, default true.
 * @memberOf Index
 */
lunr.Index.prototype.add = function (doc, emitEvent) {
  var docTokens = {},
      allDocumentTokens = new lunr.SortedSet,
      docRef = doc[this._ref]

  emitEvent = emitEvent === undefined ? true : emitEvent

  // Tokenize each field, remembering the per-field tokens for the tf
  // calculation below and collecting the union of all tokens.
  this._fields.forEach(function (field) {
    var fieldTokens = this.pipeline.run(this.tokenizerFn(doc[field.name]))

    docTokens[field.name] = fieldTokens

    for (var i = 0; i < fieldTokens.length; i++) {
      var token = fieldTokens[i]
      allDocumentTokens.add(token)
      this.corpusTokens.add(token)
    }
  }, this)

  this.documentStore.set(docRef, allDocumentTokens)

  for (var i = 0; i < allDocumentTokens.length; i++) {
    var token = allDocumentTokens.elements[i]
    var tf = 0

    for (var j = 0; j < this._fields.length; j++) {
      var field = this._fields[j]
      var fieldTokens = docTokens[field.name]
      var fieldLength = fieldTokens.length

      if (!fieldLength) continue

      var tokenCount = 0
      for (var k = 0; k < fieldLength; k++) {
        if (fieldTokens[k] === token) {
          tokenCount++
        }
      }

      // Term frequency is normalised by field length and weighted by the
      // field's boost.
      tf += (tokenCount / fieldLength * field.boost)
    }

    this.tokenStore.add(token, { ref: docRef, tf: tf })
  }

  if (emitEvent) this.eventEmitter.emit('add', doc, this)
}

/**
 * Removes a document from the index.
 *
 * To make sure documents no longer show up in search results they can be
 * removed from the index using this method.
 *
 * The document passed only needs to have the same ref property value as the
 * document that was added to the index, they could be completely different
 * objects.
 *
 * A 'remove' event is emitted with the document that has been removed and the index
 * the document has been removed from. This event can be silenced by passing false
 * as the second argument to remove.
 *
 * @param {Object} doc The document to remove from the index.
 * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
 * @memberOf Index
 */
lunr.Index.prototype.remove = function (doc, emitEvent) {
  var docRef = doc[this._ref]

  emitEvent = emitEvent === undefined ? true : emitEvent

  if (!this.documentStore.has(docRef)) return

  var docTokens = this.documentStore.get(docRef)

  this.documentStore.remove(docRef)

  docTokens.forEach(function (token) {
    this.tokenStore.remove(token, docRef)
  }, this)

  if (emitEvent) this.eventEmitter.emit('remove', doc, this)
}

/**
 * Updates a document in the index.
 *
 * When a document contained within the index gets updated, fields changed,
 * added or removed, to make sure it correctly matched against search queries,
 * it should be updated in the index.
 *
 * This method is just a wrapper around `remove` and `add`
 *
 * An 'update' event is emitted with the document that has been updated and the index.
 * This event can be silenced by passing false as the second argument to update. Only
 * an update event will be fired, the 'add' and 'remove' events of the underlying calls
 * are silenced.
 *
 * @param {Object} doc The document to update in the index.
 * @param {Boolean} emitEvent Whether to emit update events, defaults to true
 * @see Index.prototype.remove
 * @see Index.prototype.add
 * @memberOf Index
 */
lunr.Index.prototype.update = function (doc, emitEvent) {
  emitEvent = emitEvent === undefined ? true : emitEvent

  this.remove(doc, false)
  this.add(doc, false)

  if (emitEvent) this.eventEmitter.emit('update', doc, this)
}

/**
 * Calculates the inverse document frequency for a token within the index.
 *
 * The result is memoised in this._idfCache; the cache is invalidated whenever
 * the index emits an 'add', 'remove' or 'update' event.
 *
 * @param {String} term The term to calculate the idf of.
 * @returns {Number}
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.idf = function (term) {
  // The "@" prefix keeps user terms from colliding with Object.prototype
  // property names when used as cache keys.
  var cacheKey = "@" + term
  if (Object.prototype.hasOwnProperty.call(this._idfCache, cacheKey)) return this._idfCache[cacheKey]

  var documentFrequency = this.tokenStore.count(term),
      idf = 1

  if (documentFrequency > 0) {
    idf = 1 + Math.log(this.documentStore.length / documentFrequency)
  }

  return this._idfCache[cacheKey] = idf
}

/**
 * Searches the index using the passed query.
 *
 * Queries should be a string, multiple words are allowed and will lead to an
 * AND based query, e.g. `idx.search('foo bar')` will run a search for
 * documents containing both 'foo' and 'bar'.
 *
 * All query tokens are passed through the same pipeline that document tokens
 * are passed through, so any language processing involved will be run on every
 * query term.
 *
 * Each query term is expanded, so that the term 'he' might be expanded to
 * 'hello' and 'help' if those terms were already included in the index.
 *
 * Matching documents are returned as an array of objects, each object contains
 * the matching document ref, as set for this index, and the similarity score
 * for this document against the query.
 *
 * @param {String} query The query to search the index with.
 * @returns {Object}
 * @see Index.prototype.idf
 * @see Index.prototype.documentVector
 * @memberOf Index
 */
lunr.Index.prototype.search = function (query) {
  var queryTokens = this.pipeline.run(this.tokenizerFn(query)),
      queryVector = new lunr.Vector,
      documentSets = [],
      fieldBoosts = this._fields.reduce(function (memo, f) { return memo + f.boost }, 0)

  var hasSomeToken = queryTokens.some(function (token) {
    return this.tokenStore.has(token)
  }, this)

  if (!hasSomeToken) return []

  queryTokens
    .forEach(function (token, i, tokens) {
      var tf = 1 / tokens.length * this._fields.length * fieldBoosts,
          self = this

      var set = this.tokenStore.expand(token).reduce(function (memo, key) {
        var pos = self.corpusTokens.indexOf(key),
            idf = self.idf(key),
            similarityBoost = 1,
            set = new lunr.SortedSet

        // if the expanded key is not an exact match to the token then
        // penalise the score for this key by how different the key is
        // to the token.
        // The Math.max(3, ...) floor keeps Math.log(diff) away from zero
        // (log(1) === 0 would divide by zero for a one-character expansion).
        if (key !== token) {
          var diff = Math.max(3, key.length - token.length)
          similarityBoost = 1 / Math.log(diff)
        }

        // calculate the query tf-idf score for this token
        // applying a similarityBoost to ensure exact matches
        // rank higher than expanded terms
        if (pos > -1) queryVector.insert(pos, tf * idf * similarityBoost)

        // add all the documents that have this key into a set
        // ensuring that the type of key is preserved
        var matchingDocuments = self.tokenStore.get(key),
            refs = Object.keys(matchingDocuments),
            refsLen = refs.length

        for (var i = 0; i < refsLen; i++) {
          set.add(matchingDocuments[refs[i]].ref)
        }

        return memo.union(set)
      }, new lunr.SortedSet)

      documentSets.push(set)
    }, this)

  // AND semantics: a document must match every query token.
  var documentSet = documentSets.reduce(function (memo, set) {
    return memo.intersect(set)
  })

  return documentSet
    .map(function (ref) {
      return { ref: ref, score: queryVector.similarity(this.documentVector(ref)) }
    }, this)
    .sort(function (a, b) {
      return b.score - a.score
    })
}

/**
 * Generates a vector containing all the tokens in the document matching the
 * passed documentRef.
 *
 * The vector contains the tf-idf score for each token contained in the
 * document with the passed documentRef. The vector will contain an element
 * for every token in the indexes corpus, if the document does not contain that
 * token the element will be 0.
 *
 * @param {Object} documentRef The ref to find the document with.
 * @returns {lunr.Vector}
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.documentVector = function (documentRef) {
  var documentTokens = this.documentStore.get(documentRef),
      documentTokensLength = documentTokens.length,
      documentVector = new lunr.Vector

  for (var i = 0; i < documentTokensLength; i++) {
    var token = documentTokens.elements[i],
        tf = this.tokenStore.get(token)[documentRef].tf,
        idf = this.idf(token)

    documentVector.insert(this.corpusTokens.indexOf(token), tf * idf)
  }

  return documentVector
}

/**
 * Returns a representation of the index ready for serialisation.
 *
 * @returns {Object}
 * @memberOf Index
 */
lunr.Index.prototype.toJSON = function () {
  return {
    version: lunr.version,
    fields: this._fields,
    ref: this._ref,
    tokenizer: this.tokenizerFn.label,
    documentStore: this.documentStore.toJSON(),
    tokenStore: this.tokenStore.toJSON(),
    corpusTokens: this.corpusTokens.toJSON(),
    pipeline: this.pipeline.toJSON()
  }
}

/**
 * Applies a plugin to the current index.
 *
 * A plugin is a function that is called with the index as its context.
 * Plugins can be used to customise or extend the behaviour the index
 * in some way. A plugin is just a function, that encapsulated the custom
 * behaviour that should be applied to the index.
 *
 * The plugin function will be called with the index as its argument, additional
 * arguments can also be passed when calling use. The function will be called
 * with the index as its context.
 *
 * Example:
 *
 *     var myPlugin = function (idx, arg1, arg2) {
 *       // `this` is the index to be extended
 *       // apply any extensions etc here.
 *     }
 *
 *     var idx = lunr(function () {
 *       this.use(myPlugin, 'arg1', 'arg2')
 *     })
 *
 * @param {Function} plugin The plugin to apply.
 * @memberOf Index
 */
lunr.Index.prototype.use = function (plugin) {
  var args = Array.prototype.slice.call(arguments, 1)
  args.unshift(this)
  plugin.apply(this, args)
}