Skip to content

Commit

Permalink
fix: prevent suggesting word break characters (#1933)
Browse files Browse the repository at this point in the history
Discourage inserting `-` into words when making suggestions.
  • Loading branch information
Jason3S committed Nov 1, 2021
1 parent 425dc5d commit 42ffb98
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 11 deletions.
Binary file modified packages/Samples/dicts/nl_compound_trie3.trie.gz
Binary file not shown.
196 changes: 195 additions & 1 deletion packages/Samples/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 6 additions & 2 deletions packages/Samples/package.json
Expand Up @@ -4,9 +4,13 @@
"description": "Sample files used during unit tests",
"private": true,
"scripts": {
"test": "echo \"No test specified\""
"test": "echo \"No test specified\"",
"build-dutch-dictionary": "cspell-tools-cli compile --trie3 -x compound --merge nl_compound_trie3 -o ./dicts ./dicts/hunspell/Dutch.dic"
},
"keywords": [],
"author": "",
"license": "MIT"
"license": "MIT",
"devDependencies": {
"@cspell/cspell-tools": "^5.12.4"
}
}
5 changes: 4 additions & 1 deletion packages/cspell-tools/src/compiler/Reader.ts
Expand Up @@ -89,7 +89,7 @@ export async function readHunspellFiles(filename: string, options: ReaderOptions
return {
size: reader.dic.length,
annotatedWords() {
return reader.seqAffWords().pipe(_mapAffWords).pipe(normalizeAndDedupe);
return reader.seqAffWords().pipe(_mapAffWords, normalizeAndDedupe);
},
rawWords,
};
Expand Down Expand Up @@ -186,8 +186,11 @@ function* dedupeAndSort(words: Iterable<AnnotatedWord>): Iterable<AnnotatedWord>
}

function* _mapAffWords(affWords: Iterable<AffWord>): Generator<AnnotatedWord> {
const hasSpecial = /[~+!]/;
for (const affWord of affWords) {
const { word, flags } = affWord;
// For now, skip any word that contains one of the special characters `~`, `+`, or `!`.
if (hasSpecial.test(word)) continue;
const compound = flags.isCompoundForbidden ? '' : COMPOUND_FIX;
const forbid = flags.isForbiddenWord ? FORBID_PREFIX : '';
if (!forbid) {
Expand Down
2 changes: 1 addition & 1 deletion packages/cspell-trie-lib/src/lib/find.dutch.test.ts
Expand Up @@ -50,7 +50,7 @@ describe('Validate findWord', () => {
],

// Compounding enabled, but matching whole words (compounding not used).
['Code', { matchCase: true, compoundMode: 'compound' }, frCompoundFound(false)],
['Code', { matchCase: true, compoundMode: 'compound' }, frFound(false)],
['code', { matchCase: true, compoundMode: 'compound' }, frFound('code')],
['cafe', { matchCase: true, compoundMode: 'compound' }, frFound(false)],
['cafe', { matchCase: false, compoundMode: 'compound' }, frFound('cafe', { caseMatched: false })],
Expand Down
4 changes: 2 additions & 2 deletions packages/cspell-trie-lib/src/lib/suggest-nl.test.ts
Expand Up @@ -22,13 +22,13 @@ describe('Validate Dutch Suggestions', () => {
);

// cspell:ignore burtbewoners burgbewoners
// cspell:ignore buurtbwoners buurtbewoner
// cspell:ignore buurtbwoners buurtbewoner buurbewoners

test.each`
word | numSuggestions | expected
${'Mexico-Stad'} | ${2} | ${[sr('Mexico-Stad', 0), sr('mexico-stad', 2)]}
${'mexico-stad'} | ${2} | ${[sr('mexico-stad', 0), sr('Mexico-Stad', 2)]}
${'buurtbewoners'} | ${3} | ${[sr('buurtbewoners', 0), sr('buurtbewoners-', 86), sr('buurtbewoner', 88)]}
${'buurtbewoners'} | ${3} | ${[sr('buurtbewoners', 0), sr('buurtbewoner', 88), sr('buurbewoners', 96)]}
${'burtbewoners'} | ${2} | ${ac(sr('burgbewoners', 96), sr('buurtbewoners', 97))}
${'buurtbwoners'} | ${1} | ${[sr('buurtbewoners', 93)]}
${'buurtbewoners'} | ${1} | ${[sr('buurtbewoners', 0)]}
Expand Down
13 changes: 9 additions & 4 deletions packages/cspell-trie-lib/src/lib/suggest.ts
Expand Up @@ -11,6 +11,7 @@ const postSwapCost = swapCost - baseCost;
const insertSpaceCost = -1;
const mapSubCost = 1;
const maxCostScale = 0.5;
const discourageInsertCost = baseCost;

const setOfSeparators = new Set([JOIN_SEPARATOR, WORD_SEPARATOR]);

Expand Down Expand Up @@ -71,10 +72,14 @@ export function* genCompoundableSuggestions(
const stack: Range[] = [];
const x = ' ' + word;
const mx = x.length - 1;
const specialCosts: Record<string, number | undefined> = {
const specialInsCosts: Record<string, number | undefined> = Object.assign(Object.create(null), {
[WORD_SEPARATOR]: insertSpaceCost,
[JOIN_SEPARATOR]: insertSpaceCost,
};
});

const specialSubCosts: Record<string, number | undefined> = Object.assign(Object.create(null), {
'-': discourageInsertCost,
});

let stopNow = false;
let costLimit: MaxCost = bc * Math.min(word.length * maxCostScale, changeLimit);
Expand Down Expand Up @@ -143,8 +148,8 @@ export function* genCompoundableSuggestions(
}
const d = depth + 1;
const lastSugLetter = d > 1 ? text[d - 2] : '';
const c = bc - d;
const ci = c + (specialCosts[w] || 0);
const c = bc - d + (specialSubCosts[w] || 0);
const ci = c + (specialInsCosts[w] || 0);

// Setup first column
matrix[d] = matrix[d] || [];
Expand Down
6 changes: 6 additions & 0 deletions packages/hunspell-reader/src/IterableHunspellReader.ts
Expand Up @@ -128,6 +128,12 @@ export class IterableHunspellReader implements Iterable<string> {
return this.dicWordsSeq().map((w) => w.word);
}

/**
* Create a reader from a Hunspell aff file and dic file.
* @param affFile - path to the aff file.
* @param dicFile - path to the dic file.
* @returns a promise that resolves to an IterableHunspellReader
*/
static async createFromFiles(affFile: string, dicFile: string) {
const aff = await parseAffFileToAff(affFile, defaultEncoding);
const buffer = await fs.readFile(dicFile);
Expand Down

0 comments on commit 42ffb98

Please sign in to comment.