How to use stopword - 10 common examples

To help you get started, we’ve selected a few stopword examples based on popular ways the package is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github fergiemcdowall / search-index / test / node / mocha-tests / indexing-test-2.js View on Github external
/* global it */
/* global describe */

// Node's stream.Readable class, used below to build the stream of test documents.
const Readable = require('stream').Readable
// Log level is read from the LOG_LEVEL environment variable; falls back to 'error' when unset or empty.
const logLevel = process.env.LOG_LEVEL || 'error'
// Object-mode readable stream: the documents below are pushed onto it as plain objects.
const s = new Readable({ objectMode: true })
// NOTE(review): presumably the directory the test index is written to — not used in this excerpt; confirm.
const sandboxPath = 'test/sandbox'
const should = require('should')
// The `en` export of the stopword package (presumably its English stopword list — confirm).
const stopwords = require('stopword').en

// Test document 'a'. `year` is an array of strings, not a single value.
s.push({
  id: 'a',
  title: 'The Beatles',
  content: 'The Beatles were an English rock band, formed in Liverpool in 1960. Beatles from Liverpool',
  year: ['1960', '1961', '1962']
})
// Test document 'b'. Its `year` array shares '1962' with document 'a'.
s.push({
  id: 'b',
  title: 'The Rolling Stones',
  content: 'The Rolling Stones are an English rock band formed in London in 1962.',
  year: ['1962', '1963', '1964']
})
s.push({
  id: 'c',
  title: 'Pink Floyd',
github kaleguy / leovue / src / components / WordCloud.vue View on Github external
const textItems = this.$store.state.leotext
        children.forEach(child => {
          const t = textItems[child.t]
          let textData = {}
          try {
            textData = JSON.parse(t)
          } catch (e) {
            console.log(e, child.id)
          }
          items.push(_.get(textData, this.from, ''))
        })
        let text = items.join()
        // text = stripchar.RSspecChar(text.toLowerCase())
        text = text.replace(/[[\]&,;'"”’().*?]/g, ' ')
        let words = split(text)
        words = sw.removeStopwords(words)
        const wf = {}
        _.remove(words, word => /\d/.test(word))
        words.forEach(word => {
          if (word.length < 4) { return }
          word = word.toLowerCase()
          wf[word] = wf[word] ? wf[word] + 1 : 1
        })
        // debugger
        const wordFreq = {}
        Object.keys(wf).forEach(k => {
          const v = wf[k]
          if (v > this.threshold) wordFreq[k] = v
        })
        const keys = Object.keys(wordFreq)
        keys.forEach(k => {
          if (wordFreq[k + 's']) {
github replicatedhq / hugo-algolia / lib / index.js View on Github external
.splice(2)
              .join("/")
              .replace(/\.[^/.]+$/, "");

        //Remove _index + index files from uri
        const compUriArray = item.uri.split("/");
        const lastItemInCompArray = compUriArray[compUriArray.length - 1];
        if (
          lastItemInCompArray.includes("index") ||
          lastItemInCompArray.includes("_index")
        ) {
          compUriArray.pop();
          item.uri = compUriArray.join("/");
        }

        let content = stopword
          .removeStopwords(meta.content.split(/\s+/))
          .join(" ")
          .replace(/\W/g, " ")
          .trim();
        let truncatedContent = truncate(content, _this.contentSize); // 20kB limit
        item.content = truncatedContent;

        // If this is a partial index, remove everything but the props we want
        if (self.partial) {
          item = _.pick(item, self.customInd);
        }
        
        // Include an objectID to prevent duplicated entries in the index.
        item.objectID = meta.data.objectID
          ? meta.data.objectID
          : item.uri
github withspectrum / spectrum / iris / migrations / 20171208223206-index-messages-for-search.js View on Github external
// Strips stop words from `str`: the string is split into words by `strToArray`,
// filtered through `stopword.removeStopwords`, and re-joined with single spaces.
const withoutStopWords = str => {
  const keptWords = stopword.removeStopwords(strToArray(str));
  return keptWords.join(' ');
};
github withspectrum / spectrum / vulcan / utils / text-parsing.js View on Github external
/**
 * Removes stop words from a string.
 *
 * Splits `str` into words with `strToArray`, filters them through
 * `stopword.removeStopwords`, and joins what is left with single spaces.
 */
export const withoutStopWords = (str: string): string =>
  stopword.removeStopwords(strToArray(str)).join(' ');
github specfm / spec-next / servers / search / lib / utils / text-parsing.ts View on Github external
/**
 * Returns `str` with its stop words removed; the surviving words are joined
 * by a single space. Word splitting is delegated to `strToArray` and the
 * filtering to `stopword.removeStopwords`.
 */
export const withoutStopWords = (str) => {
  const survivors = stopword.removeStopwords(strToArray(str));
  return survivors.join(' ');
};
github WorldBrain / Memex / src / search / search-index.js View on Github external
import stopword from 'stopword'

import pipeline from './search-index-pipeline'
import { convertMetaDocId } from 'src/activity-logger'
import { RESULT_TYPES } from 'src/overview/constants'

const indexOpts = {
    batchSize: 500,
    appendOnly: false,
    indexPath: 'worldbrain-index',
    logLevel: 'info',
    preserveCase: false,
    compositeField: false,
    nGramLength: 1,
    // separator: /[|' .,\-|(\n)]+/,
    stopwords: stopword.en,
    fieldOptions: {
        // The `domain.tld(.cctld)` data from a page's URL
        // Currently used to afford `domain.tld(.cctld)` search in our queries
        // Should never need to tokenize, but put forward-slash separator incase preproecssing fails for whatever reason
        // (then domain search can still happen)
        domain: {
            weight: 40,
            fieldedSearch: true,
            separator: '/',
        },
        // Page title text; occasionally empty
        title: {
            weight: 30,
            fieldedSearch: true,
        },
        // Page URL tokenized by forward slashes; normalized slightly to remove protocol and leading `www.`
github machinelearnjs / machinelearnjs / src / lib / feature_extraction / text.ts View on Github external
/**
 * Turns raw text into tokens, optionally dropping stop words first.
 *
 * @param text - input string; it is split on single spaces before any filtering
 * @param removeSW - when true, the words are filtered through
 *   `sw.removeStopwords` using `ENGLISH_STOP_WORDS`; defaults to false
 * @returns whatever `WordTokenizer.tokenize` yields for the re-joined words
 */
private preprocess(text: string, { removeSW = false }): string[] {
    const tokenizer = new WordTokenizer();
    const words = text.split(' ');
    // Filtering is opt-in; otherwise the words pass through untouched.
    const kept = removeSW ? sw.removeStopwords(words, ENGLISH_STOP_WORDS) : words;
    return tokenizer.tokenize(kept.join(' '));
  }
}
github fergiemcdowall / term-vector / lib / term-vector.js View on Github external
exports.getVector = function(text, options) {
  if (typeof text != "string")
    throw new Error("error: input must be a string");
  var defaults = {
    nGramLength: 1,
    separator: /[\|' \.,\-|(\n)]+/,
    stopwords: sw.getStopwords()
  }
  options = _.defaults(options || {}, defaults)
  if (options.nGramLength == 0)
    throw new Error("error: nGramLength must be greater than 0");
  //tokenise string, remove stopwords
  var tokens = sw.removeStopwords(text, {
    inputSeparator: options.separator,
    stopwords: options.stopwords
  }).split(' ');
  var vec = []
  if (!isNaN(options.nGramLength)) {
    return getTermVectorForNgramLength(tokens, options.nGramLength);
  }
  else if (options.nGramLength.constructor === Array) {
    for (var i = 0; i < options.nGramLength.length; i++)
      vec = vec.concat(getTermVectorForNgramLength(tokens, options.nGramLength[i]))
    return vec;
  }
  else if (typeof(options.nGramLength)
           && (parseInt(options.nGramLength.gte) <= parseInt(options.nGramLength.lte))) {
    var j = parseInt(options.nGramLength.gte);
    while (j <= options.nGramLength.lte) {
github fergiemcdowall / term-vector / lib / term-vector.js View on Github external
exports.getVector = function(text, options) {
  if (typeof text != "string")
    throw new Error("error: input must be a string");
  var defaults = {
    nGramLength: 1,
    separator: /[\|' \.,\-|(\n)]+/,
    stopwords: sw.getStopwords()
  }
  options = _.defaults(options || {}, defaults)
  if (options.nGramLength == 0)
    throw new Error("error: nGramLength must be greater than 0");
  //tokenise string, remove stopwords
  var tokens = sw.removeStopwords(text, {
    inputSeparator: options.separator,
    stopwords: options.stopwords
  }).split(' ');
  var vec = []
  if (!isNaN(options.nGramLength)) {
    return getTermVectorForNgramLength(tokens, options.nGramLength);
  }
  else if (options.nGramLength.constructor === Array) {
    for (var i = 0; i < options.nGramLength.length; i++)
      vec = vec.concat(getTermVectorForNgramLength(tokens, options.nGramLength[i]))

stopword

A module for Node.js and the browser that takes in text and returns it stripped of stopwords. It has pre-defined stopword lists for 62 languages and also accepts lists of custom stopwords as input.

MIT
Latest version published 4 months ago

Package Health Score

76 / 100
Full package analysis