var lemmatizer = new Lemmatizer();
var antonyms = {}
//var data = fs.readFileSync("./antonyms.txt", 'utf8').split("\n")
/*_.each(data, function(value, key, list){
var value1 = value.split(",")
antonyms[value1[0]] = value1[1]
antonyms[value1[1]] = value1[0]
}, this)
*/
var old_unused_tokenizer = {tokenize: function(sentence) { return sentence.split(/[ \t,;:.!?]/).filter(function(a){return !!a}); }}
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9\-\?]+/});
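// Illustrative note (not from the original source): this pattern splits on any run of
// characters other than letters, digits, '-' and '?', so hyphenated words and a trailing
// question mark stay inside their token, e.g.
//   tokenizer.tokenize("Is a sign-on bonus possible?")
//   // -> ["Is", "a", "sign-on", "bonus", "possible?"]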
console.vlog = function(data) { fs.appendFileSync(log_file, data + '\n', 'utf8') };
// var tokenizer = new natural.WordTokenizer({'pattern':(/(\W+|\%)/)}); // WordTokenizer, TreebankWordTokenizer, WordPunctTokenizer
// var ngrams = new natural.NGrams.ngrams()
// var enhance = function (classifierType, featureExtractor, inputSplitter, featureLookupTable, labelLookupTable, preProcessor, postProcessor, TestSplitLabel, multiplyFeaturesByIDF, featureExpansion, featureExpansionScale, featureExpansionPhrase, featureFine, expansionParam) {
var enhance = function (classifierType, featureExtractor, inputSplitter, featureLookupTable, labelLookupTable, preProcessor, postProcessor, TestSplitLabel, multiplyFeaturesByIDF, featureOptions) {
// var enhance = function (classifierType, featureLookupTable, labelLookupTable) {
return classifiers.EnhancedClassifier.bind(0, {
normalizer: normalizer,
inputSplitter: inputSplitter,
featureOptions: featureOptions,
// featureExpansion: featureExpansion,
if ("input" in sample)
sample = sample.input
/* if (!('basic-dependencies' in sample['sentences']))
throw new Error("train:"+train+" basic-dependencies not in the sample "+JSON.stringify(sample))
*/
/* if (!('sentences' in sample))
throw new Error("for some reason sentences not in sample "+JSON.stringify(sample))
*/
/* if (!('tokens' in sample['sentences']))
throw new Error("for some reason tokens not in sample"+JSON.stringify(sample, null, 4))
*/
if (_.isArray(sample['sentences']))
throw new Error("feAsync is only for object sentences")
var tokenizer = new natural.RegexpTokenizer({pattern: /[^\%a-zA-Z0-9\-\?]+/});
var text = regexpNormalizer(sample["text"].toLowerCase())
console.vlog("feAsyncStanford: text: "+text)
// the array of tokens
// var tokenized = tokenizer.tokenize(text)
// console.vlog("feAsyncStanford: tokenized: "+JSON.stringify(tokenized, null, 4))
// sample['sentences'] = {"tokens":[]}
// _.each(tokenized, function(value, key, list){
// sample['sentences']['tokens'].push({
// "word": value,
// // "lemma": value[0]
// "lemma": natural.PorterStemmer.stem(value)
// // "lemma": lemmerEng.lemmatize(value[0])
// })
function getRule(text)
{
/* if (!('tokens' in sen))
{
console.vlog("DEBUGRULE: for some reason tokens is not in the sentence " + JSON.stringify(sen, null, 4))
throw new Error("DEBUGRULE: for some reason tokens is not in the sentence " + JSON.stringify(sen, null, 4))
}
*/
// var sentence = JSON.parse(JSON.stringify(sen))
console.vlog("getRule: sentence: "+text)
// change tokens
var tokenizer = new natural.RegexpTokenizer({pattern: /[^\%a-zA-Z0-9\-\?]+/});
text = regexpNormalizer(text.toLowerCase())
var tkns = natural.NGrams.ngrams(tokenizer.tokenize(text), 1)
var sentence = {}
sentence['tokens'] = []
_.each(tkns, function(value, key, list){
sentence['tokens'].push({
"word": value[0],
// "lemma": value[0]
"lemma": (lemmatizer.only_lemmas(value[0]).length > 0 ? lemmatizer.only_lemmas(value[0])[0]: value[0])
})
}, this)
console.vlog("getRule: enrich lemma: "+JSON.stringify(sentence['tokens'], null, 4))
'use strict'
const natural = require('natural')
const TfIdf = natural.TfIdf
const tokenizer = new natural.RegexpTokenizer({
// pattern: new RegExp(/[^A-Za-zÅåÀÈÌÒÙàèìòùÁÉÍÓÚÝáéíóúýÂÊÎÔÛâêîôûÃÑÕãñõÄËÏÖÜŸäëïöüÿŠŽšžÇç]/i)
pattern: new RegExp(/[^a-zA-Z\u00C0-\u017F]/i)
})
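// Note: \u00C0-\u017F spans the Latin-1 Supplement and Latin Extended-A letters, so
// accented characters such as é, ñ, ü or å are kept inside tokens rather than treated
// as token boundaries.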
class StandardAnalyzer {
constructor (fieldRules) {
this.fieldRules = fieldRules
this.tfidf = new TfIdf()
this.tfidf.setTokenizer(tokenizer)
}
add (field, value) {
if (Array.isArray(value)) {
let filteredValues = value.filter(this.isValid)
filteredValues.forEach(val => this.tfidf.addDocument(val, field))
} else if (this.isValid(value)) {
// TargetVerseDisplay.js
const api = window.ModuleApi;
const React = api.React;
const ReactBootstrap = api.ReactBootstrap;
var natural = require('natural');
var XRegExp = require('xregexp');
var nonUnicodeLetter = XRegExp('\\PL');
//Wordlength tokenizer
const tokenizer = new natural.RegexpTokenizer({pattern: nonUnicodeLetter});
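// XRegExp's \PL class matches "not a Unicode letter", so this tokenizer breaks on
// punctuation and whitespace in any script rather than only on ASCII non-letters,
// which suits words from an arbitrary target language.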
/* Contains a word from the target language, defines a lot of listeners for clicks */
const TargetWord = React.createClass({
// highlighted: false,
getInitialState: function() {
return {
highlighted: false,
wordObj: { // this is required to pass into our callbacks
'word': this.props.word,
'key': this.props.keyId
},
};
},
userClick: function() {
// salient keyword lists per label, iterated below to add overlap-count features
var salient = {
'REJECT': ['no', 'not'],
'QUERY': ['how', 'about', 'let', 'discuss']
}
var wh = ["what", "which", "how"]
var sample = JSON.parse(JSON.stringify(sample_or))
if ('input' in sample)
sample = sample.input
console.vlog("DEBUGSALIENT: text : "+ sample.text)
//var attrval = getRule(sample.sentences, sample.text).labels
var attrval = getRule(sample.text).labels
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9\-\?]+/});
var text = regexpNormalizer(sample.text.toLowerCase())
console.vlog("DEBUGSALIENT: normalized: "+text)
var tkns = _.flatten(natural.NGrams.ngrams(tokenizer.tokenize(text), 1))
console.vlog("DEBUGSALIENT: tokens: "+tkns)
var features_add = {}
_.each(salient, function(value, key, list){
var inter = _.intersection(value, tkns).length
if (inter != 0)
{
console.vlog("DEBUGSALIENT: GOT IT " +key+" "+inter)
features[key] = inter
}
var natural = require('natural');
var utils = require('./utils');
var async = require('async');
var bars = require('../../utils/bars.js');
var partitions = require('limdu/utils/partitions');
var PrecisionRecall = require("limdu/utils/PrecisionRecall");
var truth = require("../rule-based/truth_utils.js")
var truth_filename = "../../truth_teller/sentence_to_truthteller.txt"
var limdu = require("limdu");
var ftrs = limdu.features;
var rules = require("../rule-based/rules.js")
var TfIdf = natural.TfIdf
var tfidf = new TfIdf()
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9%'$+-]+/});
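// Illustrative note: keeping %, ', $, + and - out of the split class means tokens like
// "20%", "$5" and "can't" survive tokenization intact, e.g.
//   tokenizer.tokenize("we can't pay $5 or 20%")
//   // -> ["we", "can't", "pay", "$5", "or", "20%"]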
function cleanup(sentence)
{
console.log(sentence)
// the regex patterns in the next two replace() calls are empty (and would not parse), so they are left commented out:
// sentence = sentence.replace(//g, "")
// sentence = sentence.replace(//g, "")
sentence = sentence.replace(/\^/g, "")
sentence = sentence.replace(/\./g, "")
sentence = sentence.replace(/\!/g, "")
sentence = sentence.replace(/\$/g, "")
sentence = sentence.replace(/ +(?= )/g,'')
sentence = sentence.toLowerCase()
console.log("\""+sentence+"\"")
if ((sentence == "") || (sentence == " "))
sentence = false
console.log(sentence)
function generatengrams(sentence)
{
var tokenizer = new natural.RegexpTokenizer({pattern: /[^a-zA-Z0-9%'$+-]+/});
var words = tokenizer.tokenize(sentence);
var feature = []
_(3).times(function(n){
feature = feature.concat(bars.skipgrams(words, n, 3))
})
var features = []
_.each(feature, function(value, key, list){
if (!bars.isstopword(value))
features.push(value.join(" "))
}, this)
features = _.unique(features)
features = _.sortBy(features, function(num){ return num.length })
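// At this point `features` holds the unique skip-gram phrases produced by bars.skipgrams
// for n = 0..2 (joined with spaces, grams flagged by bars.isstopword dropped), ordered
// by phrase length.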
export const getSentences = (textContent) => {
const tokenizer = new natural.RegexpTokenizer({pattern: /[!?.]/});
const pureContent = removeSpace(removePuncButPreserveSentences(textContent));
return tokenizer.tokenize(pureContent);
}
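// Note: the tokenizer above splits the cleaned text on sentence-final punctuation
// ('!', '?', '.'), so getSentences returns one array entry per sentence;
// removePuncButPreserveSentences and removeSpace are helpers assumed to be defined
// elsewhere in this module.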
Search.prototype.tokenise = function(query) {
const tokeniser = new natural.RegexpTokenizer({
pattern: new RegExp(/[^a-zA-Z\u00C0-\u017F]/i)
})
return tokeniser.tokenize(query).map(word => {
return word.toLowerCase()
})
}
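// Usage sketch with a hypothetical query (assumes a constructed Search instance named `search`):
//   search.tokenise("Café au lait") // -> ["café", "au", "lait"]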