Skip to content

Commit fc49ad7

Browse files
kwahomeWahome Macharia
and
Wahome Macharia
authoredMar 27, 2023
new validator: isLocale, add support for validation of more valid language tags (#2189)
Co-authored-by: Wahome Macharia <wahome@Wahomes-MacBook-Pro.local>
1 parent 698f4e6 commit fc49ad7

File tree

2 files changed

+140
-5
lines changed

2 files changed

+140
-5
lines changed
 

‎src/lib/isLocale.js

+105-5
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,111 @@
11
import assertString from './util/assertString';
22

3-
const localeReg = /^[A-Za-z]{2,4}([_-]([A-Za-z]{4}|[\d]{3}))?([_-]([A-Za-z]{2}|[\d]{3}))?$/;
3+
/*
4+
extlang = 3ALPHA ; selected ISO 639 codes
5+
*2("-" 3ALPHA) ; permanently reserved
6+
*/
7+
const extlang = '([A-Za-z]{3}(-[A-Za-z]{3}){0,2})';
8+
9+
/*
10+
language = 2*3ALPHA ; shortest ISO 639 code
11+
["-" extlang] ; sometimes followed by
12+
; extended language subtags
13+
/ 4ALPHA ; or reserved for future use
14+
/ 5*8ALPHA ; or registered language subtag
15+
*/
16+
const language = `(([a-zA-Z]{2,3}(-${extlang})?)|([a-zA-Z]{5,8}))`;
17+
18+
/*
19+
script = 4ALPHA ; ISO 15924 code
20+
*/
21+
const script = '([A-Za-z]{4})';
22+
23+
/*
24+
region = 2ALPHA ; ISO 3166-1 code
25+
/ 3DIGIT ; UN M.49 code
26+
*/
27+
const region = '([A-Za-z]{2}|\\d{3})';
28+
29+
/*
30+
variant = 5*8alphanum ; registered variants
31+
/ (DIGIT 3alphanum)
32+
*/
33+
// A variant subtag is either 5-8 alphanumerics, or one digit followed by
// exactly three alphanumerics. The hyphen separates subtags, so it must not
// appear inside either character class.
const variant = '([A-Za-z0-9]{5,8}|(\\d[A-Za-z0-9]{3}))';
34+
35+
/*
36+
singleton = DIGIT ; 0 - 9
37+
/ %x41-57 ; A - W
38+
/ %x59-5A ; Y - Z
39+
/ %x61-77 ; a - w
40+
/ %x79-7A ; y - z
41+
*/
42+
const singleton = '(\\d|[A-W]|[Y-Z]|[a-w]|[y-z])';
43+
44+
/*
45+
extension = singleton 1*("-" (2*8alphanum))
46+
; Single alphanumerics
47+
; "x" reserved for private use
48+
*/
49+
const extension = `(${singleton}(-[A-Za-z0-9]{2,8})+)`;
50+
51+
/*
52+
privateuse = "x" 1*("-" (1*8alphanum))
53+
*/
54+
// Private-use sequence: the singleton 'x' followed by one or more subtags of
// 1-8 alphanumerics. ABNF string literals are case-insensitive and language
// tags are compared case-insensitively, so an uppercase 'X' is accepted too.
const privateuse = '([xX](-[A-Za-z0-9]{1,8})+)';
55+
56+
// irregular tags do not match the 'langtag' production and would not
57+
// otherwise be considered 'well-formed'. These tags are all valid, but
58+
// most are deprecated in favor of more modern subtags or subtag combination
59+
60+
const irregular = '((en-GB-oed)|(i-ami)|(i-bnn)|(i-default)|(i-enochian)|' +
61+
'(i-hak)|(i-klingon)|(i-lux)|(i-mingo)|(i-navajo)|(i-pwn)|(i-tao)|' +
62+
'(i-tay)|(i-tsu)|(sgn-BE-FR)|(sgn-BE-NL)|(sgn-CH-DE))';
63+
64+
// regular tags match the 'langtag' production, but their subtags are not
65+
// extended language or variant subtags: their meaning is defined by
66+
// their registration and all of these are deprecated in favor of a more
67+
// modern subtag or sequence of subtags
68+
69+
const regular = '((art-lojban)|(cel-gaulish)|(no-bok)|(no-nyn)|(zh-guoyu)|' +
70+
'(zh-hakka)|(zh-min)|(zh-min-nan)|(zh-xiang))';
71+
72+
/*
73+
grandfathered = irregular ; non-redundant tags registered
74+
/ regular ; during the RFC 3066 era
75+
76+
*/
77+
const grandfathered = `(${irregular}|${regular})`;
78+
79+
/*
80+
RFC 5646 defines delimitation of subtags via a hyphen:
81+
82+
"Subtag" refers to a specific section of a tag, delimited by a
83+
hyphen, such as the subtags 'zh', 'Hant', and 'CN' in the tag "zh-
84+
Hant-CN". Examples of subtags in this document are enclosed in
85+
single quotes ('Hant')
86+
87+
However, we need to add "_" to maintain the existing behaviour.
88+
*/
89+
const delimiter = '(-|_)';
90+
91+
/*
92+
langtag = language
93+
["-" script]
94+
["-" region]
95+
*("-" variant)
96+
*("-" extension)
97+
["-" privateuse]
98+
*/
99+
const langtag = `${language}(${delimiter}${script})?(${delimiter}${region})?(${delimiter}${variant})*(${delimiter}${extension})*(${delimiter}${privateuse})?`;
100+
101+
/*
102+
Regex implementation based on BCP 47 (RFC 5646)
103+
Tags for Identifying Languages
104+
https://www.rfc-editor.org/rfc/rfc5646.html
105+
*/
106+
const languageTagRegex = new RegExp(`(^${privateuse}$)|(^${grandfathered}$)|(^${langtag}$)`);
4107

5108
export default function isLocale(str) {
6109
assertString(str);
7-
if (str === 'en_US_POSIX' || str === 'ca_ES_VALENCIA') {
8-
return true;
9-
}
10-
return localeReg.test(str);
110+
return languageTagRegex.test(str);
11111
}

‎test/validators.test.js

+35
Original file line numberDiff line numberDiff line change
@@ -4816,16 +4816,51 @@ describe('Validators', () => {
48164816
'uz_Latn_UZ',
48174817
'en',
48184818
'gsw',
4819+
'en-US',
48194820
'es_ES',
4821+
'es-419',
48204822
'sw_KE',
48214823
'am_ET',
4824+
'zh-CHS',
48224825
'ca_ES_VALENCIA',
48234826
'en_US_POSIX',
4827+
'hak-CN',
4828+
'zh-Hant',
4829+
'zh-Hans',
4830+
'sr-Cyrl',
4831+
'sr-Latn',
4832+
'zh-cmn-Hans-CN',
4833+
'cmn-Hans-CN',
4834+
'zh-yue-HK',
4835+
'yue-HK',
4836+
'zh-Hans-CN',
4837+
'sr-Latn-RS',
4838+
'sl-rozaj',
4839+
'sl-rozaj-biske',
4840+
'sl-nedis',
4841+
'de-CH-1901',
4842+
'sl-IT-nedis',
4843+
'hy-Latn-IT-arevela',
4844+
'i-enochian',
4845+
'en-scotland-fonipa',
4846+
'sl-IT-rozaj-biske-1994',
4847+
'de-CH-x-phonebk',
4848+
'az-Arab-x-AZE-derbend',
4849+
'x-whatever',
4850+
'qaa-Qaaa-QM-x-southern',
4851+
'de-Qaaa',
4852+
'sr-Latn-QM',
4853+
'sr-Qaaa-RS',
4854+
'en-US-u-islamcal',
4855+
'zh-CN-a-myext-x-private',
4856+
'en-a-myext-b-another',
48244857
],
48254858
invalid: [
48264859
'lo_POP',
48274860
'12',
48284861
'12_DD',
4862+
'de-419-DE',
4863+
'a-DE',
48294864
],
48304865
});
48314866
});

0 commit comments

Comments
 (0)
Please sign in to comment.