1
//! <https://www.rfc-editor.org/rfc/rfc5646.txt>
2

            
3
use crate::common::LanguageTag;
4
use crate::parser::Error;
5
use nom::branch::alt;
6
use nom::bytes::streaming::{tag, take_while_m_n};
7
use nom::character::streaming::char;
8
use nom::combinator::{opt, peek, recognize, verify};
9
use nom::error::ParseError;
10
use nom::multi::{many0, many1, many_m_n};
11
use nom::{AsChar, IResult, Parser};
12

            
13
/// Tells whether `b` is a valid extension `singleton` as defined by RFC 5646.
///
/// A singleton is any ASCII letter or digit except `x` / `X`, which is
/// reserved for introducing private-use subtags.
#[inline]
const fn is_singleton(b: u8) -> bool {
    b.is_ascii_alphanumeric() && !matches!(b, b'x' | b'X')
}
17

            
18
220
fn private_use<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8], E>
19
220
where
20
220
    E: ParseError<&'a [u8]> + From<Error<'a>>,
21
220
{
22
220
    recognize((
23
220
        char('x'),
24
220
        many1((char('-'), take_while_m_n(1, 8, AsChar::is_alphanum))),
25
220
    ))
26
220
    .parse(input)
27
220
}
28

            
29
146
pub fn language_tag<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], LanguageTag, E>
30
146
where
31
146
    E: ParseError<&'a [u8]> + From<Error<'a>>,
32
146
{
33
146
    let (input, grandfathered_irregular) = opt(alt((
34
146
        tag("en-GB-oed"),
35
146
        tag("i-ami"),
36
146
        tag("i-bnn"),
37
146
        tag("i-default"),
38
146
        tag("i-enochian"),
39
146
        tag("i-hak"),
40
146
        tag("i-klingon"),
41
146
        tag("i-lux"),
42
146
        tag("i-mingo"),
43
146
        tag("i-navajo"),
44
146
        tag("i-pwn"),
45
146
        tag("i-tao"),
46
146
        tag("i-tay"),
47
146
        tag("i-tsu"),
48
146
        tag("sgn-BE-FR"),
49
146
        tag("sgn-BE-NL"),
50
146
        tag("sgn-CH-DE"),
51
146
    )))
52
146
    .parse(input)?;
53

            
54
146
    if let Some(grandfathered_irregular) = grandfathered_irregular {
55
2
        let language_tag = LanguageTag {
56
2
            language: String::from_utf8_lossy(grandfathered_irregular).to_string(),
57
2
            ext_lang: None,
58
2
            script: None,
59
2
            region: None,
60
2
            variants: Vec::with_capacity(0),
61
2
            extensions: Vec::with_capacity(0),
62
2
            private_use: None,
63
2
        };
64
2

            
65
2
        return Ok((input, language_tag));
66
144
    }
67

            
68
144
    let (input, private_use) = opt(private_use).parse(input)?;
69
144
    if let Some(private_use) = private_use {
70
2
        let language_tag = LanguageTag {
71
2
            language: String::from_utf8_lossy(private_use).to_string(),
72
2
            ext_lang: None,
73
2
            script: None,
74
2
            region: None,
75
2
            variants: Vec::with_capacity(0),
76
2
            extensions: Vec::with_capacity(0),
77
2
            private_use: None,
78
2
        };
79
2

            
80
2
        return Ok((input, language_tag));
81
142
    }
82
142

            
83
142
    lang_tag(input)
84
146
}
85

            
86
/// Peeks at the next byte and checks that
87
///   - If the next byte is a `-`, then accept
88
///   - If the next byte is alphanumeric, then reject
89
///
90
/// This can be used to prevent bad matches that end in the middle of a component.
91
390
fn clip<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8], E>
92
390
where
93
390
    E: ParseError<&'a [u8]> + From<Error<'a>>,
94
390
{
95
390
    peek(verify(
96
390
        take_while_m_n(0, 1, |c| c == b'-' || AsChar::is_alphanum(c)),
97
390
        |m: &[u8]| m == [b'-'] || m.is_empty(),
98
390
    ))
99
390
    .parse(input)
100
390
}
101

            
102
142
pub fn lang_tag<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], LanguageTag, E>
103
142
where
104
142
    E: ParseError<&'a [u8]> + From<Error<'a>>,
105
142
{
106
142
    let (input, (language, ext_lang)) = alt((
107
142
        (
108
142
            take_while_m_n(2, 3, AsChar::is_alpha),
109
142
            opt((
110
142
                char('-'),
111
142
                recognize((
112
142
                    take_while_m_n(3, 3, AsChar::is_alpha),
113
142
                    many_m_n(
114
142
                        0,
115
142
                        2,
116
142
                        (char('-'), take_while_m_n(3, 3, AsChar::is_alpha), clip),
117
142
                    ),
118
142
                    clip,
119
142
                )),
120
142
            )),
121
142
        ),
122
142
        take_while_m_n(4, 4, AsChar::is_alpha).map(|l| (l, None)),
123
142
        take_while_m_n(5, 8, AsChar::is_alpha).map(|l| (l, None)),
124
142
    ))
125
142
    .parse(input)?;
126

            
127
140
    let mut language_tag = LanguageTag {
128
140
        language: String::from_utf8_lossy(language).to_string(),
129
140
        ext_lang: ext_lang.map(|(_, ext_lang)| String::from_utf8_lossy(ext_lang).to_string()),
130
140
        script: None,
131
140
        region: None,
132
140
        variants: Vec::with_capacity(0),
133
140
        extensions: Vec::with_capacity(0),
134
140
        private_use: None,
135
140
    };
136

            
137
    // Find the script, if present
138
140
    let (input, script) =
139
140
        opt((char('-'), take_while_m_n(4, 4, AsChar::is_alpha), clip)).parse(input)?;
140

            
141
140
    if let Some((_, script, _)) = script {
142
94
        language_tag.script = Some(String::from_utf8_lossy(script).to_string());
143
108
    }
144

            
145
    // Find the region, if present
146
140
    let (input, region) = opt((
147
140
        char('-'),
148
140
        alt((
149
140
            (take_while_m_n(2, 2, AsChar::is_alpha), clip),
150
140
            (take_while_m_n(3, 3, AsChar::is_dec_digit), clip),
151
140
        )),
152
140
    ))
153
140
    .parse(input)?;
154

            
155
140
    if let Some((_, (region, _))) = region {
156
114
        language_tag.region = Some(String::from_utf8_lossy(region).to_string());
157
114
    }
158

            
159
    // Find variants, is present
160
140
    let (input, variants) = many0((
161
140
        char('-'),
162
140
        alt((
163
140
            take_while_m_n(5, 8, AsChar::is_alphanum),
164
140
            recognize((
165
140
                take_while_m_n(1, 1, AsChar::is_dec_digit),
166
140
                take_while_m_n(3, 3, AsChar::is_alphanum),
167
140
            )),
168
140
        )),
169
140
    ))
170
140
    .parse(input)?;
171

            
172
140
    if !variants.is_empty() {
173
78
        language_tag.variants = variants
174
78
            .into_iter()
175
146
            .map(|(_, v)| String::from_utf8_lossy(v).to_string())
176
78
            .collect();
177
124
    }
178

            
179
    // Find extensions, if present
180
140
    let (input, extensions) = many0((
181
140
        char('-'),
182
140
        recognize((
183
140
            take_while_m_n(1, 1, is_singleton),
184
140
            many1((char('-'), take_while_m_n(2, 8, AsChar::is_alphanum))),
185
140
        )),
186
140
    ))
187
140
    .parse(input)?;
188

            
189
140
    if !extensions.is_empty() {
190
72
        language_tag.extensions = extensions
191
72
            .into_iter()
192
140
            .map(|(_, ext)| String::from_utf8_lossy(ext).to_string())
193
72
            .collect();
194
130
    }
195

            
196
    // Find private use, if present
197
140
    let (input, private_use) = opt((char('-'), private_use)).parse(input)?;
198

            
199
140
    if let Some((_, private_use)) = private_use {
200
74
        language_tag.private_use = Some(String::from_utf8_lossy(private_use).to_string());
201
128
    }
202

            
203
140
    Ok((input, language_tag))
204
142
}
205

            
206
#[cfg(test)]
207
mod tests {
208
    use super::*;
209
    use crate::test_utils::check_rem;
210
    use test_case::test_case;
211

            
212
8
    #[test_case(b"de;"; "German")]
213
    #[test_case(b"fr;"; "French")]
214
    #[test_case(b"ja;"; "Japanese")]
215
    #[test_case(b"i-enochian;"; "example of a grandfathered tag")]
216
8
    fn simple_lang_subtag(input: &[u8]) {
217
8
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
218
8
        check_rem(rem, 1);
219
8
        assert_eq!(
220
8
            &input[..(input.len() - 1)],
221
8
            language_tag.language.as_bytes()
222
8
        );
223

            
224
8
        assert!(language_tag.ext_lang.is_none());
225
8
        assert!(language_tag.script.is_none());
226
8
        assert!(language_tag.region.is_none());
227
8
        assert!(language_tag.variants.is_empty());
228
8
        assert!(language_tag.extensions.is_empty());
229
8
        assert!(language_tag.private_use.is_none());
230
8
    }
231

            
232
8
    #[test_case(b"zh-Hant;"; "Chinese written using the Traditional Chinese script")]
233
    #[test_case(b"zh-Hans;"; "Chinese written using the Simplified Chinese script")]
234
    #[test_case(b"sr-Cyrl;"; "Serbian written using the Cyrillic script")]
235
    #[test_case(b"sr-Latn;"; "Serbian written using the Latin script")]
236
8
    fn language_subtag_plug_script_subtag(input: &[u8]) {
237
8
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
238
8
        check_rem(rem, 1);
239
8

            
240
8
        let str = String::from_utf8(input[..(input.len() - 1)].to_vec()).unwrap();
241
8
        assert_eq!(
242
8
            str.split('-').next().unwrap().as_bytes(),
243
8
            language_tag.language.as_bytes()
244
8
        );
245
8
        assert_eq!(
246
8
            Some(str.split('-').nth(1).unwrap().to_string()),
247
8
            language_tag.script
248
8
        );
249

            
250
8
        assert!(language_tag.ext_lang.is_none());
251
8
        assert!(language_tag.region.is_none());
252
8
        assert!(language_tag.variants.is_empty());
253
8
        assert!(language_tag.extensions.is_empty());
254
8
        assert!(language_tag.private_use.is_none());
255
8
    }
256

            
257
8
    #[test_case(b"zh-cmn-Hans-CN;"; "Chinese, Mandarin, Simplified script, as used in China")]
258
    #[test_case(b"cmn-Hans-CN;"; "Mandarin Chinese, Simplified script, as used in China")]
259
    #[test_case(b"zh-yue-HK;"; "Chinese, Cantonese, as used in Hong Kong SAR")]
260
    #[test_case(b"yue-HK;"; "Cantonese Chinese, as used in Hong Kong SAR")]
261
8
    fn extended_language_subtags_and_their_primary_language_subtag_counterparts(input: &[u8]) {
262
8
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
263
8
        check_rem(rem, 1);
264
8

            
265
8
        match language_tag {
266
            LanguageTag {
267
4
                language,
268
4
                ext_lang,
269
4
                script,
270
4
                region,
271
                ..
272
8
            } if language == "zh" => match region.unwrap().as_str() {
273
4
                "CN" => {
274
2
                    assert_eq!(Some("cmn".to_string()), ext_lang);
275
2
                    assert_eq!(Some("Hans".to_string()), script);
276
                }
277
2
                "HK" => {
278
2
                    assert_eq!(Some("yue".to_string()), ext_lang);
279
                }
280
                _ => panic!("Unexpected region"),
281
            },
282
            LanguageTag {
283
2
                language,
284
2
                script,
285
2
                region,
286
                ..
287
4
            } if language == "cmn" => {
288
2
                assert_eq!(Some("Hans".to_string()), script);
289
2
                assert_eq!(Some("CN".to_string()), region);
290
            }
291
            LanguageTag {
292
2
                language, region, ..
293
2
            } if language == "yue" => {
294
2
                assert_eq!(Some("HK".to_string()), region);
295
            }
296
            _ => panic!("Unexpected result"),
297
        }
298
8
    }
299

            
300
4
    #[test_case(b"zh-Hans-CN;"; "Chinese written using the Simplified script as used in mainland China")]
301
    #[test_case(b"sr-Latn-RS;"; "Serbian written using the Latin script as used in Serbia")]
302
4
    fn language_script_region(input: &[u8]) {
303
4
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
304
4
        check_rem(rem, 1);
305
4

            
306
4
        match language_tag {
307
            LanguageTag {
308
2
                language,
309
2
                script,
310
2
                region,
311
                ..
312
4
            } if language == "zh" => {
313
2
                assert_eq!(Some("Hans".to_string()), script);
314
2
                assert_eq!(Some("CN".to_string()), region);
315
            }
316
            LanguageTag {
317
2
                language,
318
2
                script,
319
2
                region,
320
                ..
321
2
            } if language == "sr" => {
322
2
                assert_eq!(Some("Latn".to_string()), script);
323
2
                assert_eq!(Some("RS".to_string()), region);
324
            }
325
            _ => panic!("Unexpected result"),
326
        }
327
4
    }
328

            
329
6
    #[test_case(b"sl-rozaj;"; "Resian dialect of Slovenian")]
330
    #[test_case(b"sl-rozaj-biske;"; "San Giorgio dialect of Resian dialect of Slovenian")]
331
    #[test_case(b"sl-nedis;"; "Nadiza dialect of Slovenian")]
332
6
    fn language_variant(input: &[u8]) {
333
6
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
334
6
        check_rem(rem, 1);
335
6

            
336
6
        match language_tag {
337
            LanguageTag {
338
6
                language, variants, ..
339
6
            } if language == "sl" => {
340
6
                match variants.first().unwrap().as_str() {
341
6
                    "rozaj" => {
342
4
                        if variants.len() == 1 {
343
2
                            // Okay
344
2
                        } else if variants.len() == 2 {
345
2
                            assert_eq!("biske", variants.last().unwrap().as_str());
346
                        } else {
347
                            panic!("Unexpected number of variants")
348
                        }
349
                    }
350
2
                    "nedis" => {
351
2
                        assert_eq!(1, variants.len());
352
                    }
353
                    _ => panic!("Unexpected variant"),
354
                }
355
            }
356
            _ => panic!("Unexpected result"),
357
        }
358
6
    }
359

            
360
4
    #[test_case(b"de-CH-1901;"; "German as used in Switzerland using the 1901 variant [orthography]")]
361
    #[test_case(b"sl-IT-nedis;"; "Slovenian as used in Italy, Nadiza dialect")]
362
4
    fn language_region_variant(input: &[u8]) {
363
4
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
364
4
        check_rem(rem, 1);
365
4

            
366
4
        match language_tag {
367
            LanguageTag {
368
2
                language,
369
2
                region,
370
2
                variants,
371
                ..
372
4
            } if language == "de" => {
373
2
                assert_eq!(Some("CH".to_string()), region);
374
2
                assert_eq!(1, variants.len());
375
2
                assert_eq!("1901", variants.first().unwrap().as_str());
376
            }
377
            LanguageTag {
378
2
                language,
379
2
                region,
380
2
                variants,
381
                ..
382
2
            } if language == "sl" => {
383
2
                assert_eq!(Some("IT".to_string()), region);
384
2
                assert_eq!(1, variants.len());
385
2
                assert_eq!("nedis", variants.first().unwrap().as_str());
386
            }
387
            _ => panic!("Unexpected result"),
388
        }
389
4
    }
390

            
391
2
    #[test_case(b"hy-Latn-IT-arevela;"; "Eastern Armenian written in Latin script, as used in Italy")]
392
2
    fn language_script_region_variant(input: &[u8]) {
393
2
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
394
2
        check_rem(rem, 1);
395
2

            
396
2
        match language_tag {
397
            LanguageTag {
398
2
                language,
399
2
                script,
400
2
                region,
401
2
                variants,
402
                ..
403
2
            } if language == "hy" => {
404
2
                assert_eq!(Some("Latn".to_string()), script);
405
2
                assert_eq!(Some("IT".to_string()), region);
406
2
                assert_eq!(1, variants.len());
407
2
                assert_eq!("arevela", variants.first().unwrap().as_str());
408
            }
409
            _ => panic!("Unexpected result"),
410
        }
411
2
    }
412

            
413
6
    #[test_case(b"de-DE;"; "German for Germany")]
414
    #[test_case(b"en-US;"; "English as used in the United States")]
415
    #[test_case(b"es-419;"; "Spanish appropriate for the Latin America and Caribbean region using the UN region code")]
416
6
    fn language_region(input: &[u8]) {
417
6
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
418
6
        check_rem(rem, 1);
419
6

            
420
6
        match language_tag {
421
            LanguageTag {
422
2
                language, region, ..
423
6
            } if language == "de" => {
424
2
                assert_eq!(Some("DE".to_string()), region);
425
            }
426
            LanguageTag {
427
2
                language, region, ..
428
4
            } if language == "en" => {
429
2
                assert_eq!(Some("US".to_string()), region);
430
            }
431
            LanguageTag {
432
2
                language, region, ..
433
2
            } if language == "es" => {
434
2
                assert_eq!(Some("419".to_string()), region);
435
            }
436
            _ => panic!("Unexpected result"),
437
        }
438
6
    }
439

            
440
4
    #[test_case(b"de-CH-x-phonebk;"; "phonebk")]
441
    #[test_case(b"az-Arab-x-AZE-derbend;"; "derbend")]
442
4
    fn private_use_subtags(input: &[u8]) {
443
4
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
444
4
        check_rem(rem, 1);
445
4

            
446
4
        match language_tag {
447
            LanguageTag {
448
2
                language,
449
2
                region,
450
2
                private_use,
451
                ..
452
4
            } if language == "de" => {
453
2
                assert_eq!(Some("CH".to_string()), region);
454
2
                assert_eq!(Some("x-phonebk".to_string()), private_use);
455
            }
456
            LanguageTag {
457
2
                language,
458
2
                script,
459
2
                private_use,
460
                ..
461
2
            } if language == "az" => {
462
2
                assert_eq!(Some("Arab".to_string()), script);
463
2
                assert_eq!(Some("x-AZE-derbend".to_string()), private_use);
464
            }
465
            _ => panic!("Unexpected result"),
466
        }
467
4
    }
468

            
469
10
    #[test_case(b"x-whatever;"; "private use using the singleton 'x'")]
470
    #[test_case(b"qaa-Qaaa-QM-x-southern;"; "all private tags")]
471
    #[test_case(b"de-Qaaa;"; "German, with a private script")]
472
    #[test_case(b"sr-Latn-QM;"; "Serbian, Latin script, private region")]
473
    #[test_case(b"sr-Qaaa-RS;"; "Serbian, private script, for Serbia")]
474
10
    fn private_use_registry_values(input: &[u8]) {
475
10
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
476
10
        check_rem(rem, 1);
477
10

            
478
10
        match language_tag {
479
10
            LanguageTag { language, .. } if language == "x-whatever" => {
480
2
                // Okay
481
2
            }
482
            LanguageTag {
483
2
                language,
484
2
                script,
485
2
                region,
486
2
                private_use,
487
                ..
488
8
            } if language == "qaa" => {
489
2
                assert_eq!(Some("Qaaa".to_string()), script);
490
2
                assert_eq!(Some("QM".to_string()), region);
491
2
                assert_eq!(Some("x-southern".to_string()), private_use);
492
            }
493
            LanguageTag {
494
2
                language, script, ..
495
6
            } if language == "de" => {
496
2
                assert_eq!(Some("Qaaa".to_string()), script);
497
            }
498
            LanguageTag {
499
4
                language,
500
4
                script,
501
4
                region,
502
                ..
503
4
            } if language == "sr" => match script.unwrap().as_str() {
504
4
                "Latn" => {
505
2
                    assert_eq!(Some("QM".to_string()), region);
506
                }
507
2
                "Qaaa" => {
508
2
                    assert_eq!(Some("RS".to_string()), region);
509
                }
510
                _ => panic!("Unexpected script"),
511
            },
512
            _ => panic!("Unexpected result"),
513
        }
514
10
    }
515

            
516
6
    #[test_case(b"en-US-u-islamcal;"; "islamcal")]
517
    #[test_case(b"zh-CN-a-myext-x-private;"; "myext and private")]
518
    #[test_case(b"en-a-myext-b-another;"; "myext and another")]
519
6
    fn tags_that_use_extensions(input: &[u8]) {
520
6
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
521
6
        check_rem(rem, 1);
522
6

            
523
6
        match language_tag {
524
            LanguageTag {
525
4
                language,
526
4
                region,
527
4
                extensions,
528
                ..
529
6
            } if language == "en" => {
530
4
                if extensions.len() == 1 {
531
2
                    assert_eq!(Some("US".to_string()), region);
532
2
                    assert_eq!(1, extensions.len());
533
2
                    assert_eq!("u-islamcal", extensions.first().unwrap().as_str());
534
2
                } else if extensions.len() == 2 {
535
2
                    assert_eq!("a-myext", extensions.first().unwrap().as_str());
536
2
                    assert_eq!("b-another", extensions.last().unwrap().as_str());
537
                } else {
538
                    panic!("Unexpected number of extensions")
539
                }
540
            }
541
            LanguageTag {
542
2
                language,
543
2
                region,
544
2
                extensions,
545
2
                private_use,
546
                ..
547
2
            } if language == "zh" => {
548
2
                assert_eq!(Some("CN".to_string()), region);
549
2
                assert_eq!(1, extensions.len());
550
2
                assert_eq!("a-myext", extensions.first().unwrap().as_str());
551
2
                assert_eq!(Some("x-private".to_string()), private_use);
552
            }
553
            _ => panic!("Unexpected result"),
554
        }
555
6
    }
556

            
557
4
    #[test_case(b"de-419-DE;"; "two region tags")]
558
    #[test_case(b"a-DE;"; "use of a single-character subtag in primary position; note that there are a few grandfathered tags that start with \"i-\" that are valid")]
559
    // This is not a parser failure but a content validation failure -> #[test_case(b"ar-a-aaa-b-bbb-a-ccc;"; "two extensions with same single-letter prefix")]
560
4
    fn some_invalid_tags(input: &[u8]) {
561
4
        let r = language_tag::<Error>(input);
562
2
        match r {
563
2
            Err(nom::Err::Error(_)) => {}
564
2
            Ok((rem, lang)) => assert!(rem.len() > 1, "Created lang: {lang:?}"),
565
            r => panic!("Unexpected result: {r:?}"),
566
        }
567
4
    }
568
}