//! Parser for language tags as specified in RFC 5646.
//!
//! <https://www.rfc-editor.org/rfc/rfc5646.txt>

use crate::common::LanguageTag;
use crate::parser::Error;
use nom::branch::alt;
use nom::bytes::streaming::{tag, take_while_m_n};
use nom::character::streaming::char;
use nom::character::{is_alphabetic, is_alphanumeric, is_digit};
use nom::combinator::{opt, peek, recognize, verify};
use nom::error::ParseError;
use nom::multi::{many0, many1, many_m_n};
use nom::sequence::{terminated, tuple};
use nom::{IResult, Parser};
/// Returns `true` if `b` may introduce an extension (RFC 5646 `singleton`).
///
/// A singleton is any ASCII digit or ASCII letter except `x`/`X`; those two are
/// excluded because they introduce a private use sequence instead.
#[inline]
const fn is_singleton(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'W' | b'Y'..=b'Z' | b'a'..=b'w' | b'y'..=b'z')
}
fn private_use<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8], E>
21
220
where
22
220
    E: ParseError<&'a [u8]> + From<Error<'a>>,
23
220
{
24
220
    recognize(tuple((
25
220
        char('x'),
26
220
        many1(tuple((char('-'), take_while_m_n(1, 8, is_alphanumeric)))),
27
220
    )))(input)
28
220
}
pub fn language_tag<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], LanguageTag, E>
31
146
where
32
146
    E: ParseError<&'a [u8]> + From<Error<'a>>,
33
146
{
34
146
    let (input, grandfathered_irregular) = opt(alt((
35
146
        tag("en-GB-oed"),
36
146
        tag("i-ami"),
37
146
        tag("i-bnn"),
38
146
        tag("i-default"),
39
146
        tag("i-enochian"),
40
146
        tag("i-hak"),
41
146
        tag("i-klingon"),
42
146
        tag("i-lux"),
43
146
        tag("i-mingo"),
44
146
        tag("i-navajo"),
45
146
        tag("i-pwn"),
46
146
        tag("i-tao"),
47
146
        tag("i-tay"),
48
146
        tag("i-tsu"),
49
146
        tag("sgn-BE-FR"),
50
146
        tag("sgn-BE-NL"),
51
146
        tag("sgn-CH-DE"),
52
146
    )))(input)?;
53

            
54
146
    if let Some(grandfathered_irregular) = grandfathered_irregular {
55
2
        let language_tag = LanguageTag {
56
2
            language: String::from_utf8_lossy(grandfathered_irregular).to_string(),
57
2
            ext_lang: None,
58
2
            script: None,
59
2
            region: None,
60
2
            variants: Vec::with_capacity(0),
61
2
            extensions: Vec::with_capacity(0),
62
2
            private_use: None,
63
2
        };
64
2

            
65
2
        return Ok((input, language_tag));
66
144
    }
67

            
68
144
    let (input, private_use) = opt(private_use)(input)?;
69
144
    if let Some(private_use) = private_use {
70
2
        let language_tag = LanguageTag {
71
2
            language: String::from_utf8_lossy(private_use).to_string(),
72
2
            ext_lang: None,
73
2
            script: None,
74
2
            region: None,
75
2
            variants: Vec::with_capacity(0),
76
2
            extensions: Vec::with_capacity(0),
77
2
            private_use: None,
78
2
        };
79
2

            
80
2
        return Ok((input, language_tag));
81
142
    }
82
142

            
83
142
    lang_tag(input)
84
146
}
/// Peeks at the next byte and checks that
87
///   - If the next byte is a `-`, then accept
88
///   - If the next byte is alphanumeric, then reject
89
///
90
/// This can be used to prevent bad matches that end in the middle of a component.
91
390
fn clip<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], &'a [u8], E>
92
390
where
93
390
    E: ParseError<&'a [u8]> + From<Error<'a>>,
94
390
{
95
390
    peek(verify(
96
12698
        take_while_m_n(0, 1, |c| c == b'-' || is_alphanumeric(c)),
97
390
        |m: &[u8]| m == [b'-'] || m.is_empty(),
98
390
    ))(input)
99
390
}
pub fn lang_tag<'a, E>(input: &'a [u8]) -> IResult<&'a [u8], LanguageTag, E>
102
142
where
103
142
    E: ParseError<&'a [u8]> + From<Error<'a>>,
104
142
{
105
142
    let (input, (language, ext_lang)) = alt((
106
142
        tuple((
107
142
            take_while_m_n(2, 3, is_alphabetic),
108
142
            opt(tuple((
109
142
                char('-'),
110
142
                recognize(tuple((
111
142
                    take_while_m_n(3, 3, is_alphabetic),
112
142
                    many_m_n(
113
142
                        0,
114
142
                        2,
115
142
                        tuple((char('-'), take_while_m_n(3, 3, is_alphabetic), clip)),
116
142
                    ),
117
142
                    clip,
118
142
                ))),
119
142
            ))),
120
142
        )),
121
142
        take_while_m_n(4, 4, is_alphabetic).map(|l| (l, None)),
122
142
        take_while_m_n(5, 8, is_alphabetic).map(|l| (l, None)),
123
142
    ))(input)?;
124

            
125
140
    let mut language_tag = LanguageTag {
126
140
        language: String::from_utf8_lossy(language).to_string(),
127
140
        ext_lang: ext_lang.map(|(_, ext_lang)| String::from_utf8_lossy(ext_lang).to_string()),
128
140
        script: None,
129
140
        region: None,
130
140
        variants: Vec::with_capacity(0),
131
140
        extensions: Vec::with_capacity(0),
132
140
        private_use: None,
133
140
    };
134

            
135
    // Find the script, if present
136
140
    let (input, script) = opt(tuple((
137
140
        char('-'),
138
140
        take_while_m_n(4, 4, is_alphabetic),
139
140
        clip,
140
140
    )))(input)?;
141

            
142
140
    if let Some((_, script, _)) = script {
143
94
        language_tag.script = Some(String::from_utf8_lossy(script).to_string());
144
108
    }
145

            
146
    // Find the region, if present
147
140
    let (input, region) = opt(tuple((
148
140
        char('-'),
149
140
        alt((
150
140
            tuple((take_while_m_n(2, 2, is_alphabetic), clip)),
151
140
            tuple((take_while_m_n(3, 3, is_digit), clip)),
152
140
        )),
153
140
    )))(input)?;
154

            
155
140
    if let Some((_, (region, _))) = region {
156
114
        language_tag.region = Some(String::from_utf8_lossy(region).to_string());
157
114
    }
158

            
159
    // Find variants, is present
160
140
    let (input, variants) = many0(tuple((
161
140
        char('-'),
162
140
        alt((
163
140
            take_while_m_n(5, 8, is_alphanumeric),
164
140
            recognize(tuple((
165
140
                take_while_m_n(1, 1, is_digit),
166
140
                take_while_m_n(3, 3, is_alphanumeric),
167
140
            ))),
168
140
        )),
169
140
    )))(input)?;
170

            
171
140
    if !variants.is_empty() {
172
78
        language_tag.variants = variants
173
78
            .into_iter()
174
146
            .map(|(_, v)| String::from_utf8_lossy(v).to_string())
175
78
            .collect();
176
124
    }
177

            
178
    // Find extensions, if present
179
140
    let (input, extensions) = many0(tuple((
180
140
        char('-'),
181
140
        recognize(tuple((
182
140
            take_while_m_n(1, 1, is_singleton),
183
140
            many1(tuple((char('-'), take_while_m_n(2, 8, is_alphanumeric)))),
184
140
        ))),
185
140
    )))(input)?;
186

            
187
140
    if !extensions.is_empty() {
188
72
        language_tag.extensions = extensions
189
72
            .into_iter()
190
140
            .map(|(_, ext)| String::from_utf8_lossy(ext).to_string())
191
72
            .collect();
192
130
    }
193

            
194
    // Find private use, if present
195
140
    let (input, private_use) = opt(tuple((char('-'), private_use)))(input)?;
196

            
197
140
    if let Some((_, private_use)) = private_use {
198
74
        language_tag.private_use = Some(String::from_utf8_lossy(private_use).to_string());
199
128
    }
200

            
201
140
    Ok((input, language_tag))
202
142
}
// Test inputs follow the examples in RFC 5646, Appendix A. Every input carries a trailing
// `;` so that the streaming parsers can see where the tag ends.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_utils::check_rem;
    use test_case::test_case;

    // A lone language subtag (or grandfathered tag) ends up in `language` only.
    #[test_case(b"de;"; "German")]
    #[test_case(b"fr;"; "French")]
    #[test_case(b"ja;"; "Japanese")]
    #[test_case(b"i-enochian;"; "example of a grandfathered tag")]
    fn simple_lang_subtag(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);
        assert_eq!(
            &input[..(input.len() - 1)],
            language_tag.language.as_bytes()
        );

        assert!(language_tag.ext_lang.is_none());
        assert!(language_tag.script.is_none());
        assert!(language_tag.region.is_none());
        assert!(language_tag.variants.is_empty());
        assert!(language_tag.extensions.is_empty());
        assert!(language_tag.private_use.is_none());
    }

    // Language followed by a four-letter script.
    #[test_case(b"zh-Hant;"; "Chinese written using the Traditional Chinese script")]
    #[test_case(b"zh-Hans;"; "Chinese written using the Simplified Chinese script")]
    #[test_case(b"sr-Cyrl;"; "Serbian written using the Cyrillic script")]
    #[test_case(b"sr-Latn;"; "Serbian written using the Latin script")]
    fn language_subtag_plus_script_subtag(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        let str = String::from_utf8(input[..(input.len() - 1)].to_vec()).unwrap();
        assert_eq!(
            str.split('-').next().unwrap().as_bytes(),
            language_tag.language.as_bytes()
        );
        assert_eq!(
            Some(str.split('-').nth(1).unwrap().to_string()),
            language_tag.script
        );

        assert!(language_tag.ext_lang.is_none());
        assert!(language_tag.region.is_none());
        assert!(language_tag.variants.is_empty());
        assert!(language_tag.extensions.is_empty());
        assert!(language_tag.private_use.is_none());
    }

    // Extended language subtags versus the same language used as primary subtag.
    #[test_case(b"zh-cmn-Hans-CN;"; "Chinese, Mandarin, Simplified script, as used in China")]
    #[test_case(b"cmn-Hans-CN;"; "Mandarin Chinese, Simplified script, as used in China")]
    #[test_case(b"zh-yue-HK;"; "Chinese, Cantonese, as used in Hong Kong SAR")]
    #[test_case(b"yue-HK;"; "Cantonese Chinese, as used in Hong Kong SAR")]
    fn extended_language_subtags_and_their_primary_language_subtag_counterparts(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language,
                ext_lang,
                script,
                region,
                ..
            } if language == "zh" => match region.unwrap().as_str() {
                "CN" => {
                    assert_eq!(Some("cmn".to_string()), ext_lang);
                    assert_eq!(Some("Hans".to_string()), script);
                }
                "HK" => {
                    assert_eq!(Some("yue".to_string()), ext_lang);
                }
                _ => panic!("Unexpected region"),
            },
            LanguageTag {
                language,
                script,
                region,
                ..
            } if language == "cmn" => {
                assert_eq!(Some("Hans".to_string()), script);
                assert_eq!(Some("CN".to_string()), region);
            }
            LanguageTag {
                language, region, ..
            } if language == "yue" => {
                assert_eq!(Some("HK".to_string()), region);
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Language, script and region together.
    #[test_case(b"zh-Hans-CN;"; "Chinese written using the Simplified script as used in mainland China")]
    #[test_case(b"sr-Latn-RS;"; "Serbian written using the Latin script as used in Serbia")]
    fn language_script_region(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language,
                script,
                region,
                ..
            } if language == "zh" => {
                assert_eq!(Some("Hans".to_string()), script);
                assert_eq!(Some("CN".to_string()), region);
            }
            LanguageTag {
                language,
                script,
                region,
                ..
            } if language == "sr" => {
                assert_eq!(Some("Latn".to_string()), script);
                assert_eq!(Some("RS".to_string()), region);
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Language followed directly by one or two variants.
    #[test_case(b"sl-rozaj;"; "Resian dialect of Slovenian")]
    #[test_case(b"sl-rozaj-biske;"; "San Giorgio dialect of Resian dialect of Slovenian")]
    #[test_case(b"sl-nedis;"; "Nadiza dialect of Slovenian")]
    fn language_variant(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language, variants, ..
            } if language == "sl" => {
                match variants.first().unwrap().as_str() {
                    "rozaj" => {
                        if variants.len() == 1 {
                            // Okay
                        } else if variants.len() == 2 {
                            assert_eq!("biske", variants.last().unwrap().as_str());
                        } else {
                            panic!("Unexpected number of variants")
                        }
                    }
                    "nedis" => {
                        assert_eq!(1, variants.len());
                    }
                    _ => panic!("Unexpected variant"),
                }
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Language, region and a single variant.
    #[test_case(b"de-CH-1901;"; "German as used in Switzerland using the 1901 variant [orthography]")]
    #[test_case(b"sl-IT-nedis;"; "Slovenian as used in Italy, Nadiza dialect")]
    fn language_region_variant(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language,
                region,
                variants,
                ..
            } if language == "de" => {
                assert_eq!(Some("CH".to_string()), region);
                assert_eq!(1, variants.len());
                assert_eq!("1901", variants.first().unwrap().as_str());
            }
            LanguageTag {
                language,
                region,
                variants,
                ..
            } if language == "sl" => {
                assert_eq!(Some("IT".to_string()), region);
                assert_eq!(1, variants.len());
                assert_eq!("nedis", variants.first().unwrap().as_str());
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Language, script, region and variant all present.
    #[test_case(b"hy-Latn-IT-arevela;"; "Eastern Armenian written in Latin script, as used in Italy")]
    fn language_script_region_variant(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language,
                script,
                region,
                variants,
                ..
            } if language == "hy" => {
                assert_eq!(Some("Latn".to_string()), script);
                assert_eq!(Some("IT".to_string()), region);
                assert_eq!(1, variants.len());
                assert_eq!("arevela", variants.first().unwrap().as_str());
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Language plus an alphabetic or numeric region.
    #[test_case(b"de-DE;"; "German for Germany")]
    #[test_case(b"en-US;"; "English as used in the United States")]
    #[test_case(b"es-419;"; "Spanish appropriate for the Latin America and Caribbean region using the UN region code")]
    fn language_region(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language, region, ..
            } if language == "de" => {
                assert_eq!(Some("DE".to_string()), region);
            }
            LanguageTag {
                language, region, ..
            } if language == "en" => {
                assert_eq!(Some("US".to_string()), region);
            }
            LanguageTag {
                language, region, ..
            } if language == "es" => {
                assert_eq!(Some("419".to_string()), region);
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Trailing private use sequences are kept whole, including the `x-` prefix.
    #[test_case(b"de-CH-x-phonebk;"; "phonebk")]
    #[test_case(b"az-Arab-x-AZE-derbend;"; "derbend")]
    fn private_use_subtags(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language,
                region,
                private_use,
                ..
            } if language == "de" => {
                assert_eq!(Some("CH".to_string()), region);
                assert_eq!(Some("x-phonebk".to_string()), private_use);
            }
            LanguageTag {
                language,
                script,
                private_use,
                ..
            } if language == "az" => {
                assert_eq!(Some("Arab".to_string()), script);
                assert_eq!(Some("x-AZE-derbend".to_string()), private_use);
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Private use registry values parse like any other language, script or region.
    #[test_case(b"x-whatever;"; "private use using the singleton 'x'")]
    #[test_case(b"qaa-Qaaa-QM-x-southern;"; "all private tags")]
    #[test_case(b"de-Qaaa;"; "German, with a private script")]
    #[test_case(b"sr-Latn-QM;"; "Serbian, Latin script, private region")]
    #[test_case(b"sr-Qaaa-RS;"; "Serbian, private script, for Serbia")]
    fn private_use_registry_values(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag { language, .. } if language == "x-whatever" => {
                // Okay
            }
            LanguageTag {
                language,
                script,
                region,
                private_use,
                ..
            } if language == "qaa" => {
                assert_eq!(Some("Qaaa".to_string()), script);
                assert_eq!(Some("QM".to_string()), region);
                assert_eq!(Some("x-southern".to_string()), private_use);
            }
            LanguageTag {
                language, script, ..
            } if language == "de" => {
                assert_eq!(Some("Qaaa".to_string()), script);
            }
            LanguageTag {
                language,
                script,
                region,
                ..
            } if language == "sr" => match script.unwrap().as_str() {
                "Latn" => {
                    assert_eq!(Some("QM".to_string()), region);
                }
                "Qaaa" => {
                    assert_eq!(Some("RS".to_string()), region);
                }
                _ => panic!("Unexpected script"),
            },
            _ => panic!("Unexpected result"),
        }
    }

    // Extensions are kept whole, including their singleton.
    #[test_case(b"en-US-u-islamcal;"; "islamcal")]
    #[test_case(b"zh-CN-a-myext-x-private;"; "myext and private")]
    #[test_case(b"en-a-myext-b-another;"; "myext and another")]
    fn tags_that_use_extensions(input: &[u8]) {
        let (rem, language_tag) = language_tag::<Error>(input).unwrap();
        check_rem(rem, 1);

        match language_tag {
            LanguageTag {
                language,
                region,
                extensions,
                ..
            } if language == "en" => {
                if extensions.len() == 1 {
                    assert_eq!(Some("US".to_string()), region);
                    assert_eq!(1, extensions.len());
                    assert_eq!("u-islamcal", extensions.first().unwrap().as_str());
                } else if extensions.len() == 2 {
                    assert_eq!("a-myext", extensions.first().unwrap().as_str());
                    assert_eq!("b-another", extensions.last().unwrap().as_str());
                } else {
                    panic!("Unexpected number of extensions")
                }
            }
            LanguageTag {
                language,
                region,
                extensions,
                private_use,
                ..
            } if language == "zh" => {
                assert_eq!(Some("CN".to_string()), region);
                assert_eq!(1, extensions.len());
                assert_eq!("a-myext", extensions.first().unwrap().as_str());
                assert_eq!(Some("x-private".to_string()), private_use);
            }
            _ => panic!("Unexpected result"),
        }
    }

    // Invalid tags either fail outright or leave more than the terminator unparsed.
    #[test_case(b"de-419-DE;"; "two region tags")]
    #[test_case(b"a-DE;"; "use of a single-character subtag in primary position; note that there are a few grandfathered tags that start with \"i-\" that are valid")]
    // This is not a parser failure but a content validation failure -> #[test_case(b"ar-a-aaa-b-bbb-a-ccc;"; "two extensions with same single-letter prefix")]
    fn some_invalid_tags(input: &[u8]) {
        let r = language_tag::<Error>(input);
        match r {
            Err(nom::Err::Error(_)) => {}
            Ok((rem, lang)) => assert!(rem.len() > 1, "Created lang: {lang:?}"),
            r => panic!("Unexpected result: {r:?}"),
        }
    }
}