From dd1fc2c59f9fe97a9057e65e47d266d14f51c735 Mon Sep 17 00:00:00 2001 From: Hugo-ter-Doest Date: Fri, 22 Mar 2024 19:18:27 +0100 Subject: [PATCH] Adding more ts specs --- lib/natural/tfidf/index.d.ts | 4 +- lib/natural/tfidf/tfidf.js | 2 +- lib/natural/trie/index.d.ts | 2 +- package.json | 2 +- spec/brill_pos_tagger_test.ts | 17 -- spec/tfidf_test.ts | 50 ---- spec/trie_test.ts | 23 -- {spec => ts_spec}/SentenceAnalyzer_test.ts | 0 {spec => ts_spec}/SentimentAnalyzer_test.ts | 0 {spec => ts_spec}/bayes_classifier_spec.ts | 0 {spec => ts_spec}/brill_pos_tagger_spec.ts | 6 +- {spec => ts_spec}/classifier_spec.ts | 0 {spec => ts_spec}/distance_test.ts | 0 {spec => ts_spec}/inflectors_test.ts | 0 .../logistic_regression_classifier_spec.ts | 0 {spec => ts_spec}/ngrams_test.ts | 0 {spec => ts_spec}/phonetics_test.ts | 0 {spec => ts_spec}/spellcheck_test.ts | 0 {spec => ts_spec}/stemmers_test.ts | 0 ts_spec/tfidf_spec.ts | 277 ++++++++++++++++++ {spec => ts_spec}/tokenizers_test.ts | 0 {spec => ts_spec}/transliterators_test.ts | 0 ts_spec/trie_spec.ts | 197 +++++++++++++ {spec => ts_spec}/util_test.ts | 0 {spec => ts_spec}/wordnet_test.ts | 0 tsconfig.json | 2 +- 26 files changed, 483 insertions(+), 99 deletions(-) delete mode 100644 spec/brill_pos_tagger_test.ts delete mode 100644 spec/tfidf_test.ts delete mode 100644 spec/trie_test.ts rename {spec => ts_spec}/SentenceAnalyzer_test.ts (100%) rename {spec => ts_spec}/SentimentAnalyzer_test.ts (100%) rename {spec => ts_spec}/bayes_classifier_spec.ts (100%) rename {spec => ts_spec}/brill_pos_tagger_spec.ts (88%) rename {spec => ts_spec}/classifier_spec.ts (100%) rename {spec => ts_spec}/distance_test.ts (100%) rename {spec => ts_spec}/inflectors_test.ts (100%) rename {spec => ts_spec}/logistic_regression_classifier_spec.ts (100%) rename {spec => ts_spec}/ngrams_test.ts (100%) rename {spec => ts_spec}/phonetics_test.ts (100%) rename {spec => ts_spec}/spellcheck_test.ts (100%) rename {spec => ts_spec}/stemmers_test.ts (100%) 
create mode 100644 ts_spec/tfidf_spec.ts rename {spec => ts_spec}/tokenizers_test.ts (100%) rename {spec => ts_spec}/transliterators_test.ts (100%) create mode 100644 ts_spec/trie_spec.ts rename {spec => ts_spec}/util_test.ts (100%) rename {spec => ts_spec}/wordnet_test.ts (100%) diff --git a/lib/natural/tfidf/index.d.ts b/lib/natural/tfidf/index.d.ts index e3a2fb61a..8e2272280 100644 --- a/lib/natural/tfidf/index.d.ts +++ b/lib/natural/tfidf/index.d.ts @@ -25,7 +25,7 @@ THE SOFTWARE. import type { Tokenizer } from '../tokenizers' -declare type TfIdfCallback = (i: number, measure: number, key?: string) => void +declare type TfIdfCallback = (i: number, measure: number, key?: string | Record<string, number>) => void declare interface TfIdfTerm { term: string @@ -42,7 +42,7 @@ export class TfIdf { constructor (deserialized?: Record<string, unknown>) idf (term: string, force?: boolean): number - addDocument (document: string | string[], key?: string, restoreCache?: boolean): void + addDocument (document: string | string[] | Record<string, string>, key?: Record<string, number> | any, restoreCache?: boolean): void addFileSync (path: string, encoding?: string, key?: string, restoreCache?: boolean): void tfidf (terms: string | string[], d: number): number tfidfs (terms: string | string[], callback?: TfIdfCallback): number[] diff --git a/lib/natural/tfidf/tfidf.js b/lib/natural/tfidf/tfidf.js index 2ee16e3bf..9d0978fc0 100644 --- a/lib/natural/tfidf/tfidf.js +++ b/lib/natural/tfidf/tfidf.js @@ -203,7 +203,7 @@ class TfIdf { setTokenizer (t) { if (!_.isFunction(t.tokenize)) { throw new Error('Expected a valid Tokenizer') } tokenizer = t - } + } // Define a stopwords other than the default setStopwords (customStopwords) { diff --git a/lib/natural/trie/index.d.ts b/lib/natural/trie/index.d.ts index cfda0a258..0dacdd962 100644 --- a/lib/natural/trie/index.d.ts +++ b/lib/natural/trie/index.d.ts @@ -27,7 +27,7 @@ export class Trie { addString (string: string): boolean addStrings (list: string[]): void contains (string: string): boolean - 
findPrefix (search: string): string[] + findPrefix (search: string): Array<string | null> findMatchesOnPath (search: string): string[] keysWithPrefix (prefix: string): string[] getSize (): number diff --git a/package.json b/package.json index 9c1ac2ce2..2db843e32 100644 --- a/package.json +++ b/package.json @@ -77,7 +77,7 @@ "clean": "rimraf *~ #* *classifier.json dist io_spec/tmp/*.json", "test": "cross-env NODE_PATH=. jasmine --random=false spec/*_spec.js", "test_io": "jasmine --random=false io_spec/*_spec.js", - "test_ts": "cross-env NODE_PATH=. jasmine --random=false dist/spec/*_spec.js", + "test_ts": "cross-env NODE_PATH=. jasmine --random=false dist/ts_spec/*_spec.js", "coverage": "nyc --reporter=lcov npm run test && nyc npm run test_io", "test_browser": "cross-env NODE_PATH=. node ./node_modules/gulp/bin/gulp.js", "lint": "eslint . --ext .ts" diff --git a/spec/brill_pos_tagger_test.ts b/spec/brill_pos_tagger_test.ts deleted file mode 100644 index 0d7c1aa33..000000000 --- a/spec/brill_pos_tagger_test.ts +++ /dev/null @@ -1,17 +0,0 @@ -import { - Lexicon, - RuleSet, - BrillPOSTagger -} from '../lib/natural/brill_pos_tagger' - -const rulesFilename = './data/English/tr_from_posjs.json' -const lexiconFilename = './data/English/lexicon_from_posjs.json' -const defaultCategory = 'N' - -const lexicon = new Lexicon(lexiconFilename, defaultCategory) -const rules = new RuleSet(rulesFilename) -const tagger = new BrillPOSTagger(lexicon, rules) - -const sentence = ['I', 'see', 'the', 'man', 'with', 'the', 'telescope'] -const taggedSentence = tagger.tag(sentence) -console.log(taggedSentence.taggedWords) diff --git a/spec/tfidf_test.ts b/spec/tfidf_test.ts deleted file mode 100644 index 47b891623..000000000 --- a/spec/tfidf_test.ts +++ /dev/null @@ -1,50 +0,0 @@ -import { TfIdf } from '../lib/natural/tfidf' - -let tfidf = new TfIdf() - -tfidf.addDocument('this document is about node.') -tfidf.addDocument('this document is about ruby.') -tfidf.addDocument('this document is about ruby and 
node.') -tfidf.addDocument('this document is about node. it has node examples') - -console.log('node --------------------------------') -tfidf.tfidfs('node', function (i, measure) { - console.log('document #' + i.toString() + ' is ' + measure.toString()) -}) - -console.log('ruby --------------------------------') -tfidf.tfidfs('ruby', function (i, measure) { - console.log('document #' + i.toString() + ' is ' + measure.toString()) -}) -console.log(tfidf.tfidf('node', 0)) -console.log(tfidf.tfidf('node', 1)) -tfidf = new TfIdf() -// tfidf.addFileSync('data_files/one.txt') -// tfidf.addFileSync('data_files/two.txt') -tfidf.addDocument('this document is about node.') -tfidf.addDocument('this document is about ruby.') -tfidf.addDocument('this document is about ruby and node.') - -tfidf.tfidfs('node ruby', function (i, measure) { - console.log('document #' + i.toString() + ' is ' + measure.toString()) -}) -tfidf.addDocument(['document', 'about', 'node']) -tfidf.addDocument(['document', 'about', 'ruby']) -tfidf.addDocument(['document', 'about', 'ruby', 'node']) -tfidf.addDocument(['document', 'about', 'node', 'node', 'examples']) - -tfidf.tfidfs(['node', 'ruby'], function (i, measure) { - console.log('document #' + i.toString() + ' is ' + measure.toString()) -}) -tfidf.listTerms(0 /* document index */).forEach(function (item) { - console.log(item.term + ': ' + item.tfidf.toString()) -}) -tfidf = new TfIdf() -tfidf.addDocument('document one', 'un') -tfidf.addDocument('document Two', 'deux') -const s = JSON.stringify(tfidf) -// save "s" to disk, database or otherwise - -// assuming you pulled "s" back out of storage. 
-const obj: Record = JSON.parse(s) -tfidf = new TfIdf(obj) diff --git a/spec/trie_test.ts b/spec/trie_test.ts deleted file mode 100644 index afb9f69e7..000000000 --- a/spec/trie_test.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { Trie } from '../lib/natural/trie' - -const trie = new Trie() - -// Add one string at a time -trie.addString('test') - -// Or add many strings -trie.addStrings(['string1', 'string2', 'string3']) -console.log(trie.contains('test')) // true -console.log(trie.contains('asdf')) // false -console.log(trie.findPrefix('tester')) // ['test', 'er'] -console.log(trie.findPrefix('string4')) // [null, '4'] -console.log(trie.findPrefix('string3')) // ['string3', ''] -trie.addString('tes') -trie.addString('est') -console.log(trie.findMatchesOnPath('tester')) // ['tes', 'test']; -console.log(trie.keysWithPrefix('string')) // ["string1", "string2", "string3"] -trie.contains('TEST') // false - -const ciTrie = new Trie(false) -ciTrie.addString('test') -ciTrie.contains('TEsT') // true diff --git a/spec/SentenceAnalyzer_test.ts b/ts_spec/SentenceAnalyzer_test.ts similarity index 100% rename from spec/SentenceAnalyzer_test.ts rename to ts_spec/SentenceAnalyzer_test.ts diff --git a/spec/SentimentAnalyzer_test.ts b/ts_spec/SentimentAnalyzer_test.ts similarity index 100% rename from spec/SentimentAnalyzer_test.ts rename to ts_spec/SentimentAnalyzer_test.ts diff --git a/spec/bayes_classifier_spec.ts b/ts_spec/bayes_classifier_spec.ts similarity index 100% rename from spec/bayes_classifier_spec.ts rename to ts_spec/bayes_classifier_spec.ts diff --git a/spec/brill_pos_tagger_spec.ts b/ts_spec/brill_pos_tagger_spec.ts similarity index 88% rename from spec/brill_pos_tagger_spec.ts rename to ts_spec/brill_pos_tagger_spec.ts index f31eb3994..879465f79 100644 --- a/spec/brill_pos_tagger_spec.ts +++ b/ts_spec/brill_pos_tagger_spec.ts @@ -21,11 +21,11 @@ along with this program. If not, see . 
import { WordTokenizer, Lexicon, RuleSet, BrillPOSTagger } from 'lib/natural' import type { Sentence, TagResults } from 'lib/natural' -import * as englishSentences from './test_data/NYT-20150205-picassos-granddaughter-plans-to-sell-art-worrying-the-market.json' +import * as englishSentences from '../spec/test_data/NYT-20150205-picassos-granddaughter-plans-to-sell-art-worrying-the-market.json' -import jsonData from './test_data/NYT-20150205-picassos-granddaughter-plans_expected_tag_results.json' +import jsonData from '../spec/test_data/NYT-20150205-picassos-granddaughter-plans_expected_tag_results.json' -import * as dutchSentences from './test_data/Volkskrant-20150205-Knot-geldpers-aanzetten-is-paardenmiddel-voor-half-procent-inflatie.json' +import * as dutchSentences from '../spec/test_data/Volkskrant-20150205-Knot-geldpers-aanzetten-is-paardenmiddel-voor-half-procent-inflatie.json' const englishTagResults: TagResults = jsonData as TagResults const DEBUG = false diff --git a/spec/classifier_spec.ts b/ts_spec/classifier_spec.ts similarity index 100% rename from spec/classifier_spec.ts rename to ts_spec/classifier_spec.ts diff --git a/spec/distance_test.ts b/ts_spec/distance_test.ts similarity index 100% rename from spec/distance_test.ts rename to ts_spec/distance_test.ts diff --git a/spec/inflectors_test.ts b/ts_spec/inflectors_test.ts similarity index 100% rename from spec/inflectors_test.ts rename to ts_spec/inflectors_test.ts diff --git a/spec/logistic_regression_classifier_spec.ts b/ts_spec/logistic_regression_classifier_spec.ts similarity index 100% rename from spec/logistic_regression_classifier_spec.ts rename to ts_spec/logistic_regression_classifier_spec.ts diff --git a/spec/ngrams_test.ts b/ts_spec/ngrams_test.ts similarity index 100% rename from spec/ngrams_test.ts rename to ts_spec/ngrams_test.ts diff --git a/spec/phonetics_test.ts b/ts_spec/phonetics_test.ts similarity index 100% rename from spec/phonetics_test.ts rename to ts_spec/phonetics_test.ts diff 
--git a/spec/spellcheck_test.ts b/ts_spec/spellcheck_test.ts similarity index 100% rename from spec/spellcheck_test.ts rename to ts_spec/spellcheck_test.ts diff --git a/spec/stemmers_test.ts b/ts_spec/stemmers_test.ts similarity index 100% rename from spec/stemmers_test.ts rename to ts_spec/stemmers_test.ts diff --git a/ts_spec/tfidf_spec.ts b/ts_spec/tfidf_spec.ts new file mode 100644 index 000000000..461584f27 --- /dev/null +++ b/ts_spec/tfidf_spec.ts @@ -0,0 +1,277 @@ +/* +Copyright (c) 2011, Rob Ellis, Chris Umbel + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +'use strict' + +import { TfIdf, TreebankWordTokenizer, stopwords } from 'lib/natural' +import type { TfIdfTerm } from 'lib/natural' + +let tfidf: TfIdf + +describe('tfidf', function () { + describe('stateless operations', function () { + it('should tf', function () { + expect(TfIdf.tf('document', { document: 2, one: 1 })).toBe(2) + expect(TfIdf.tf('document', { greetings: 1, program: 1 })).toBe(0) + expect(TfIdf.tf('program', { greetings: 1, program: 1 })).toBe(1) + }) + }) + + describe('keys', function () { + it('should store and recall keys', function () { + tfidf = new TfIdf() + tfidf.addDocument('document one', 'un') + tfidf.addDocument('document Two', 'deux') + + tfidf.tfidfs('two', function (i, tfidf, key) { + if (i === 0) { expect(key).toBe('un') } else { expect(key).toBe('deux') } + }) + }) + + it('should handle a deserialized object passed to the constructor', function () { + tfidf = new TfIdf({ + documents: [ + { __key: 'un', document: 1, one: 1 }, + { __key: 'deux', document: 1, two: 1 } + ] + }) + tfidf.tfidfs('two', function (i, tfidf, key) { + if (i === 1) { expect(key).toBe('deux') } else { expect(key).toBe('un') } + }) + }) + + it('should work when called without a callback', function () { + tfidf = new TfIdf({ + documents: [ + { __key: 'un', document: 1, one: 1 }, + { __key: 'deux', document: 1, two: 1 } + ] + }) + const tfidfs = tfidf.tfidfs('two') + expect(tfidfs[1]).toBe(1 + Math.log(2.0 / 2.0)) + }) + + it('should work with the restoreCache flag set to true', function () { + tfidf = new TfIdf() + tfidf.addDocument('document one', 'un') + expect(tfidf.idf('one')).toBe(1 + Math.log(1.0 / 2.0)) + tfidf.addDocument('document Two', 'deux', true) + + tfidf.tfidfs('two', function (i, tfidf, key) { + if (i === 0) { expect(key).toBe('un') } else { expect(key).toBe('deux') } + }) + }) + }) + + describe('stateful operations', function () { + beforeEach(function () { + tfidf = new TfIdf() + tfidf.addDocument('document one') + 
tfidf.addDocument('document Two') + }) + + it('should list important terms', function () { + const terms: TfIdfTerm[] = tfidf.listTerms(0) + expect(terms[0].tfidf).toBeGreaterThan(terms[1].tfidf) + }) + }) + + describe('special cases', function () { + // In response to + it('should handle reserved function names correctly in documents', function () { + const reservedWords = [ + 'toString', + 'toLocaleString', + 'valueOf', + 'hasOwnProperty', + 'isPrototypeOf', + 'propertyIsEnumerable', + 'constructor' + ] + tfidf = new TfIdf() + tfidf.addDocument(reservedWords.join(' ')) + + reservedWords.forEach((word: string) => { + expect(tfidf.tfidf(word, 0)).toBe(1 * (1 + Math.log(1 / 2))) + }) + }) + + it('should handle an array passed to tfidf()', function () { + tfidf = new TfIdf() + const terms = ['this', 'document', 'is', 'about', 'poetry'] + tfidf.addDocument(terms.join(' ')) + expect(tfidf.tfidf(terms, 0)).toBe(2 * (1 + Math.log(1.0 / 2.0))) + }) + }) + + describe('correct calculations', function () { + it('should compute idf correctly', function () { + tfidf = new TfIdf() + tfidf.addDocument('this document is about node.') + tfidf.addDocument('this document is about ruby.') + tfidf.addDocument('this document is about ruby and node.') + tfidf.addDocument('this document is about node. it has node examples') + + expect(tfidf.idf('node')).toBe(1 + Math.log(4.0 / 4.0)) + }) + + it('should compute idf correctly with non-string documents', function () { + tfidf = new TfIdf() + tfidf.addDocument('this document is about node.') + tfidf.addDocument('this document is about ruby.') + tfidf.addDocument('this document is about ruby and node.') + tfidf.addDocument('this document is about node. 
it has node examples') + tfidf.addDocument({ text: 'this document is about python' }) + tfidf.addDocument(['this', 'document', 'is', 'about', 'node', 'and', 'JavaScript']) + + expect(tfidf.idf('node')).toBe(1 + Math.log(6.0 / 5.0)) + }) + + it('should compute tf correctly', function () { + expect(TfIdf.tf('node', { this: 1, document: 1, is: 1, about: 1, node: 1 })).toBe(1) + expect(TfIdf.tf('node', { this: 1, document: 1, is: 1, about: 1, ruby: 1 })).toBe(0) + expect(TfIdf.tf('node', { this: 1, document: 1, is: 1, about: 1, ruby: 1, and: 1, node: 1 })).toBe(1) + expect(TfIdf.tf('node', { this: 1, document: 1, is: 1, about: 1, node: 2, it: 1, has: 1, examples: 1 })).toBe(2) + }) + + // This is a test of the use case outlined in the readme. + it('should compute tf-idf correctly', function () { + const correctCalculations: number[] = [ + 1 * (1 + Math.log(4.0 / 4.0)), + 0, + 2 * (1 + Math.log(4.0 / 4.0)), + 1 * (1 + Math.log(4.0 / 3.0)) + ] + + tfidf = new TfIdf() + tfidf.addDocument('this document is about node.', { node: 0, ruby: 1 }) + tfidf.addDocument('this document is about ruby.', { node: 1, ruby: 3 }) + tfidf.addDocument('this document is about ruby and node.', { node: 0, ruby: 3 }) + tfidf.addDocument('this document is about node. 
it has node examples', { node: 2, ruby: 1 }) + + tfidf.tfidfs('node', (i: number, measure: number, k: string | Record<string, number> | undefined): void => { + if (typeof k === 'object') { + expect(measure).toBe(correctCalculations[k.node]) + } + else { + console.log('Key was undefined') + } + }) + + tfidf.tfidfs('ruby', (i: number, measure: number, k: string | Record<string, number> | undefined): void => { + if (typeof k === 'object') { + expect(measure).toBe(correctCalculations[k.ruby]) + } + else { + console.log('Key was undefined') + } + }) + }) + + it('should not return NaN if a term is not present in any documents', function () { + tfidf = new TfIdf() + tfidf.addDocument('this document is about node.') + + expect(tfidf.tfidf('ruby', 0)).toBe(0) + }) + + // This test assures that tf-idf is computed correctly before and after a document is added + // Computes and tests a few tf-idfs, then adds a document and ensures that those terms tf-idf value + // is updated accordingly. + it('should update a terms tf-idf score after adding documents', function () { + tfidf = new TfIdf() + + // Add 2 documents + tfidf.addDocument('this document is about node.', 0) + tfidf.addDocument('this document is about ruby.', 1) + + // check the tf-idf for 'node' + expect(tfidf.tfidf('node', 0)).toBe(1 * (1 + Math.log(2.0 / 2.0))) + + // Add 2 more documents + tfidf.addDocument('this document is about ruby and node.') + tfidf.addDocument('this document is about node. it has node examples') + + // Ensure that the tf-idf in the same document has changed to reflect the new idf. 
+ expect(tfidf.tfidf('node', 0)).toBe(1 * (1 + Math.log(4.0 / 4.0))) + }) + + // Test idf.setTokenizer + it('should allow for specific types of Tokenizers', function () { + tfidf = new TfIdf() + + tfidf.addDocument('this document isn\'t about node.', 0) + tfidf.addDocument('that doc is about node.', 1) + expect(tfidf.tfidf('n\'t', 0)).toBe(0) + expect(tfidf.tfidf('isn', 0)).toBe(1 * (1 + Math.log(2 / 2))) + + tfidf = new TfIdf() + + tfidf.addDocument('this document isn\'t about node.', 0) + tfidf.addDocument('this document isn\'t about node.', 1) + + expect(tfidf.tfidf('isn', 0)).toBe(1 * (1 + Math.log(2 / 3))) + + tfidf = new TfIdf() + const tokenizer = new TreebankWordTokenizer() + + tfidf.addDocument('this document isn\'t about node.', 0) + tfidf.setTokenizer(tokenizer) + tfidf.addDocument('this doc isn\'t about node.', 1) + + expect(tfidf.tfidf('isn', 0)).toBe(1 * (1 + Math.log(2 / 2))) + expect(tfidf.tfidf('n\'t', 1)).toBe(1 * (1 + Math.log(2 / 2))) + expect(tfidf.tfidf('isn', 1)).toBe(0) + }) + + it('should require a valid tokenizer when using setTokenizer', function () { + tfidf = new TfIdf() + + // @ts-expect-error + expect(function () { tfidf.setTokenizer(1) }).toThrow(new Error('Expected a valid Tokenizer')) + // @ts-expect-error + expect(function () { tfidf.setTokenizer({}) }).toThrow(new Error('Expected a valid Tokenizer')) + }) + }) + + describe('Stopwords', function () { + it('should load a custom set of stopwords', function () { + tfidf = new TfIdf() + expect(tfidf.setStopwords(stopwords)).toEqual(true) + tfidf.addDocument('this document is about node.', 0) + const terms = tfidf.listTerms(0) + const tokens = terms.map(t => t.term) + expect(tokens.indexOf('about')).toEqual(-1) + expect(tokens.indexOf('this')).toEqual(-1) + }) + it('should detect an incorrect stopwords list (not an array)', function () { + const stopwords = {} + // @ts-expect-error + expect(tfidf.setStopwords(stopwords)).toEqual(false) + }) + it('should detect an incorrect stopwords 
list (one of the elements is not a string)', function () { + const stopwords = [function f () {}] + // @ts-expect-error + expect(tfidf.setStopwords(stopwords)).toEqual(false) + }) + }) +}) diff --git a/spec/tokenizers_test.ts b/ts_spec/tokenizers_test.ts similarity index 100% rename from spec/tokenizers_test.ts rename to ts_spec/tokenizers_test.ts diff --git a/spec/transliterators_test.ts b/ts_spec/transliterators_test.ts similarity index 100% rename from spec/transliterators_test.ts rename to ts_spec/transliterators_test.ts diff --git a/ts_spec/trie_spec.ts b/ts_spec/trie_spec.ts new file mode 100644 index 000000000..a3e4825eb --- /dev/null +++ b/ts_spec/trie_spec.ts @@ -0,0 +1,197 @@ +/* +Copyright (c) 2014 Ken Koch + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +'use strict' + +import { Trie } from 'lib/natural' + +describe('trie', function () { + describe('adding strings', function () { + it('should add words one at a time', function () { + const trie = new Trie() + trie.addString('test') + expect(trie.contains('test')).toBe(true) + + trie.addString('abcd') + expect(trie.contains('abcd')).toBe(true) + }) + + it('should return true if a string is added that already existed', function () { + const trie = new Trie() + expect(trie.addString('test')).toBe(false) + expect(trie.addString('test')).toBe(true) + }) + + it('Should add an array of strings', function () { + const trie = new Trie() + const testWords = ['test', 'abcd', 'ffff'] + trie.addStrings(testWords) + + for (let i = testWords.length - 1; i >= 0; i--) { + expect(trie.contains(testWords[i])).toBe(true) + }; + }) + }) + + describe('getSize', function () { + it('should return 1 for an empty trie', function () { + const trie = new Trie() + expect(trie.getSize()).toBe(1) + }) + + it('should return the correct size', function () { + const trie = new Trie() + trie.addString('a') + expect(trie.getSize()).toBe(2) + + trie.addString('ab') + expect(trie.getSize()).toBe(3) + }) + + it('should count all branches', function () { + const trie = new Trie() + trie.addString('a') + expect(trie.getSize()).toBe(2) + + trie.addString('ba') + expect(trie.getSize()).toBe(4) + }) + }) + + describe('searching', function () { + it("should not find words that haven't been added", function () { + const trie = new Trie() + expect(trie.contains('aaaaa')).toBe(false) + }) + + it('should not return prefixes of words that have been added as words', function () { + const trie = new Trie() + trie.addString('test') + expect(trie.contains('test')).toBe(true) + expect(trie.contains('tes')).toBe(false) + }) + + it('should not return suffixes of words that have been added as words', function () { + const trie = new Trie() + trie.addString('test') + expect(trie.contains('test')).toBe(true) + 
expect(trie.contains('est')).toBe(false) + }) + + it('should not find a word that falls in between two other words but has not been added', function () { + const trie = new Trie() + trie.addString('test') + trie.addString('tested') + expect(trie.contains('test')).toBe(true) + expect(trie.contains('tested')).toBe(true) + expect(trie.contains('teste')).toBe(false) + }) + }) + + function expectResults (results: string[]): void { + expect(results).toContain('a') + expect(results).toContain('ab') + expect(results).toContain('abc') + expect(results).not.toContain('bc') + expect(results).not.toContain('cd') + } + + describe('prefix searching', function () { + it('should be able to find all full prefix matched words along a path.', function () { + const trie = new Trie() + trie.addStrings(['a', 'ab', 'bc', 'cd', 'abc']) + const results = trie.findMatchesOnPath('abcd') + expectResults(results) + }) + + it('should be able to guess all full prefix matched words.', function () { + const trie = new Trie() + trie.addStrings(['a', 'ab', 'bc', 'cd', 'abc']) + let results = trie.keysWithPrefix('a') + expectResults(results) + results = trie.keysWithPrefix('ab') + expect(results).toContain('ab') + expect(results).toContain('abc') + expect(results).not.toContain('a') + expect(results).not.toContain('bc') + expect(results).not.toContain('cd') + }) + + it('should be able to execute the find_prefix search with a match', function () { + const trie = new Trie() + trie.addStrings(['their', 'and', 'they']) + const results = trie.findPrefix('theyre') + expect(results[0]).toBe('they') + expect(results[1]).toBe('re') + }) + + it('should return empty array if no full prefix matches found.', function () { + const trie = new Trie() + trie.addStrings(['a', 'ab', 'bc', 'cd', 'abc']) + const results = trie.keysWithPrefix('not-found') + expect(results.length).toEqual(0) + }) + + it('should be able to execute the find_prefix search without a match', function () { + const trie = new Trie() + 
trie.addStrings(['their', 'and']) + const results = trie.findPrefix('theyre') + expect(results[0]).toBe(null) + expect(results[1]).toBe('yre') + }) + }) + + describe('case sensitivity', function () { + it('should be case sensitive by default', function () { + const trie = new Trie() + trie.addString('test') + expect(trie.contains('TEST')).toBe(false) + }) + + it('should have contains in case-insensitive mode', function () { + const trie = new Trie(false) + trie.addString('test') + expect(trie.contains('TEST')).toBe(true) + }) + + it('should have case-insensitive contains when the strings are added with case', function () { + const trie = new Trie(false) + trie.addString('teSt') + expect(trie.contains('test')).toBe(true) + }) + + it('should have findMatchesOnPath in case-insensitive mode', function () { + const trie = new Trie(false) + trie.addStrings(['a', 'ab', 'bc', 'cd', 'abc']) + const results = trie.findMatchesOnPath('ABcD') + expectResults(results) + }) + + it('should have findPrefix in case-insensitive mode', function () { + const trie = new Trie(false) + trie.addStrings(['thEIr', 'And', 'theY']) + const results = trie.findPrefix('ThEyRe') + expect(results[0]).toBe('they') + expect(results[1]).toBe('re') + }) + }) +}) diff --git a/spec/util_test.ts b/ts_spec/util_test.ts similarity index 100% rename from spec/util_test.ts rename to ts_spec/util_test.ts diff --git a/spec/wordnet_test.ts b/ts_spec/wordnet_test.ts similarity index 100% rename from spec/wordnet_test.ts rename to ts_spec/wordnet_test.ts diff --git a/tsconfig.json b/tsconfig.json index 579918f35..dbd953453 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -17,7 +17,7 @@ }, "include": [ "lib/natural/**/*.ts", - "spec/*.ts", + "ts_spec/*.ts", "io_spec/*.ts" ], "exclude": [