Filename | /usr/share/perl5/I18N/LangTags.pm |
Statements | Executed 192 statements in 4.29ms |
Calls | P | F | Exclusive Time |
Inclusive Time |
Subroutine |
---|---|---|---|---|---|
1 | 1 | 1 | 25µs | 67µs | BEGIN@7 | I18N::LangTags::
1 | 1 | 1 | 21µs | 132µs | BEGIN@8 | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | alternate_language_tags | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | encode_language_tag | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | extract_language_tags | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | implicate_supers | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | implicate_supers_strictly | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | is_dialect_of | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | is_language_tag | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | locale2language_tag | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | panic_languages | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | same_language_tag | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | similarity_language_tag | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | super_languages | I18N::LangTags::
0 | 0 | 0 | 0s | 0s | uniq | I18N::LangTags::
Line | State ments |
Time on line |
Calls | Time in subs |
Code |
---|---|---|---|---|---|
1 | |||||
2 | # Time-stamp: "2004-10-06 23:26:33 ADT" | ||||
3 | # Sean M. Burke <sburke@cpan.org> | ||||
4 | |||||
5 | 1 | 42µs | require 5.000; | ||
6 | package I18N::LangTags; | ||||
7 | 2 | 85µs | 2 | 108µs | # spent 67µs (25+41) within I18N::LangTags::BEGIN@7 which was called:
# once (25µs+41µs) by Locale::Maketext::BEGIN@7 at line 7 # spent 67µs making 1 call to I18N::LangTags::BEGIN@7
# spent 41µs making 1 call to strict::import |
8 | 2 | 3.72ms | 2 | 244µs | # spent 132µs (21+111) within I18N::LangTags::BEGIN@8 which was called:
# once (21µs+111µs) by Locale::Maketext::BEGIN@7 at line 8 # spent 132µs making 1 call to I18N::LangTags::BEGIN@8
# spent 111µs making 1 call to vars::import |
9 | 1 | 900ns | require Exporter; | ||
10 | 1 | 16µs | @ISA = qw(Exporter); | ||
11 | 1 | 600ns | @EXPORT = qw(); | ||
12 | 1 | 15µs | @EXPORT_OK = qw(is_language_tag same_language_tag | ||
13 | extract_language_tags super_languages | ||||
14 | similarity_language_tag is_dialect_of | ||||
15 | locale2language_tag alternate_language_tags | ||||
16 | encode_language_tag panic_languages | ||||
17 | implicate_supers | ||||
18 | implicate_supers_strictly | ||||
19 | ); | ||||
20 | 1 | 4µs | %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); | ||
21 | |||||
22 | 1 | 700ns | $VERSION = "0.38"; | ||
23 | |||||
24 | sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function | ||||
25 | |||||
26 | |||||
27 | =head1 NAME | ||||
28 | |||||
29 | I18N::LangTags - functions for dealing with RFC3066-style language tags | ||||
30 | |||||
31 | =head1 SYNOPSIS | ||||
32 | |||||
33 | use I18N::LangTags(); | ||||
34 | |||||
35 | ...or specify whichever of those functions you want to import, like so: | ||||
36 | |||||
37 | use I18N::LangTags qw(implicate_supers similarity_language_tag); | ||||
38 | |||||
39 | All the exportable functions are listed below -- you're free to import | ||||
40 | only some, or none at all. By default, none are imported. If you | ||||
41 | say: | ||||
42 | |||||
43 | use I18N::LangTags qw(:ALL) | ||||
44 | |||||
45 | ...then all are exported. (This saves you from having to use | ||||
46 | something less obvious like C<use I18N::LangTags qw(/./)>.) | ||||
47 | |||||
48 | If you don't import any of these functions, assume a C<&I18N::LangTags::> | ||||
49 | in front of all the function names in the following examples. | ||||
50 | |||||
51 | =head1 DESCRIPTION | ||||
52 | |||||
53 | Language tags are a formalism, described in RFC 3066 (obsoleting | ||||
54 | 1766), for declaring what language form (language and possibly | ||||
55 | dialect) a given chunk of information is in. | ||||
56 | |||||
57 | This library provides functions for common tasks involving language | ||||
58 | tags as they are needed in a variety of protocols and applications. | ||||
59 | |||||
60 | Please see the "See Also" references for a thorough explanation | ||||
61 | of how to correctly use language tags. | ||||
62 | |||||
63 | =over | ||||
64 | |||||
65 | =cut | ||||
66 | |||||
67 | ########################################################################### | ||||
68 | |||||
69 | =item * the function is_language_tag($lang1) | ||||
70 | |||||
71 | Returns true iff $lang1 is a formally valid language tag. | ||||
72 | |||||
73 | is_language_tag("fr") is TRUE | ||||
74 | is_language_tag("x-jicarilla") is FALSE | ||||
75 | (Subtags can be 8 chars long at most -- 'jicarilla' is 9) | ||||
76 | |||||
77 | is_language_tag("sgn-US") is TRUE | ||||
78 | (That's American Sign Language) | ||||
79 | |||||
80 | is_language_tag("i-Klikitat") is TRUE | ||||
81 | (True without regard to the fact noone has actually | ||||
82 | registered Klikitat -- it's a formally valid tag) | ||||
83 | |||||
84 | is_language_tag("fr-patois") is TRUE | ||||
85 | (Formally valid -- altho descriptively weak!) | ||||
86 | |||||
87 | is_language_tag("Spanish") is FALSE | ||||
88 | is_language_tag("french-patois") is FALSE | ||||
89 | (No good -- first subtag has to match | ||||
90 | /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) | ||||
91 | |||||
92 | is_language_tag("x-borg-prot2532") is TRUE | ||||
93 | (Yes, subtags can contain digits, as of RFC3066) | ||||
94 | |||||
95 | =cut | ||||
96 | |||||
97 | sub is_language_tag { | ||||
98 | |||||
99 | ## Changes in the language tagging standards may have to be reflected here. | ||||
100 | |||||
101 | my($tag) = lc($_[0]); | ||||
102 | |||||
103 | return 0 if $tag eq "i" or $tag eq "x"; | ||||
104 | # Bad degenerate cases that the following | ||||
105 | # regexp would erroneously let pass | ||||
106 | |||||
107 | return $tag =~ | ||||
108 | /^(?: # First subtag | ||||
109 | [xi] | [a-z]{2,3} | ||||
110 | ) | ||||
111 | (?: # Subtags thereafter | ||||
112 | - # separator | ||||
113 | [a-z0-9]{1,8} # subtag | ||||
114 | )* | ||||
115 | $/xs ? 1 : 0; | ||||
116 | } | ||||
117 | |||||
118 | ########################################################################### | ||||
119 | |||||
120 | =item * the function extract_language_tags($whatever) | ||||
121 | |||||
122 | Returns a list of whatever looks like formally valid language tags | ||||
123 | in $whatever. Not very smart, so don't get too creative with | ||||
124 | what you want to feed it. | ||||
125 | |||||
126 | extract_language_tags("fr, fr-ca, i-mingo") | ||||
127 | returns: ('fr', 'fr-ca', 'i-mingo') | ||||
128 | |||||
129 | extract_language_tags("It's like this: I'm in fr -- French!") | ||||
130 | returns: ('It', 'in', 'fr') | ||||
131 | (So don't just feed it any old thing.) | ||||
132 | |||||
133 | The output is untainted. If you don't know what tainting is, | ||||
134 | don't worry about it. | ||||
135 | |||||
136 | =cut | ||||
137 | |||||
138 | sub extract_language_tags { | ||||
139 | |||||
140 | ## Changes in the language tagging standards may have to be reflected here. | ||||
141 | |||||
142 | my($text) = | ||||
143 | $_[0] =~ m/(.+)/ # to make for an untainted result | ||||
144 | ? $1 : '' | ||||
145 | ; | ||||
146 | |||||
147 | return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags | ||||
148 | $text =~ | ||||
149 | m/ | ||||
150 | \b | ||||
151 | (?: # First subtag | ||||
152 | [iIxX] | [a-zA-Z]{2,3} | ||||
153 | ) | ||||
154 | (?: # Subtags thereafter | ||||
155 | - # separator | ||||
156 | [a-zA-Z0-9]{1,8} # subtag | ||||
157 | )* | ||||
158 | \b | ||||
159 | /xsg | ||||
160 | ); | ||||
161 | } | ||||
162 | |||||
163 | ########################################################################### | ||||
164 | |||||
165 | =item * the function same_language_tag($lang1, $lang2) | ||||
166 | |||||
167 | Returns true iff $lang1 and $lang2 are acceptable variant tags | ||||
168 | representing the same language-form. | ||||
169 | |||||
170 | same_language_tag('x-kadara', 'i-kadara') is TRUE | ||||
171 | (The x/i- alternation doesn't matter) | ||||
172 | same_language_tag('X-KADARA', 'i-kadara') is TRUE | ||||
173 | (...and neither does case) | ||||
174 | same_language_tag('en', 'en-US') is FALSE | ||||
175 | (all-English is not the SAME as US English) | ||||
176 | same_language_tag('x-kadara', 'x-kadar') is FALSE | ||||
177 | (these are totally unrelated tags) | ||||
178 | same_language_tag('no-bok', 'nb') is TRUE | ||||
179 | (no-bok is a legacy tag for nb (Norwegian Bokmal)) | ||||
180 | |||||
181 | C<same_language_tag> works by just seeing whether | ||||
182 | C<encode_language_tag($lang1)> is the same as | ||||
183 | C<encode_language_tag($lang2)>. | ||||
184 | |||||
185 | (Yes, I know this function is named a bit oddly. Call it historic | ||||
186 | reasons.) | ||||
187 | |||||
188 | =cut | ||||
189 | |||||
190 | sub same_language_tag { | ||||
191 | my $el1 = &encode_language_tag($_[0]); | ||||
192 | return 0 unless defined $el1; | ||||
193 | # this avoids the problem of | ||||
194 | # encode_language_tag($lang1) eq and encode_language_tag($lang2) | ||||
195 | # being true if $lang1 and $lang2 are both undef | ||||
196 | |||||
197 | return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; | ||||
198 | } | ||||
199 | |||||
200 | ########################################################################### | ||||
201 | |||||
202 | =item * the function similarity_language_tag($lang1, $lang2) | ||||
203 | |||||
204 | Returns an integer representing the degree of similarity between | ||||
205 | tags $lang1 and $lang2 (the order of which does not matter), where | ||||
206 | similarity is the number of common elements on the left, | ||||
207 | without regard to case and to x/i- alternation. | ||||
208 | |||||
209 | similarity_language_tag('fr', 'fr-ca') is 1 | ||||
210 | (one element in common) | ||||
211 | similarity_language_tag('fr-ca', 'fr-FR') is 1 | ||||
212 | (one element in common) | ||||
213 | |||||
214 | similarity_language_tag('fr-CA-joual', | ||||
215 | 'fr-CA-PEI') is 2 | ||||
216 | similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 | ||||
217 | (two elements in common) | ||||
218 | |||||
219 | similarity_language_tag('x-kadara', 'i-kadara') is 1 | ||||
220 | (x/i- doesn't matter) | ||||
221 | |||||
222 | similarity_language_tag('en', 'x-kadar') is 0 | ||||
223 | similarity_language_tag('x-kadara', 'x-kadar') is 0 | ||||
224 | (unrelated tags -- no similarity) | ||||
225 | |||||
226 | similarity_language_tag('i-cree-syllabic', | ||||
227 | 'i-cherokee-syllabic') is 0 | ||||
228 | (no B<leftmost> elements in common!) | ||||
229 | |||||
230 | =cut | ||||
231 | |||||
232 | sub similarity_language_tag { | ||||
233 | my $lang1 = &encode_language_tag($_[0]); | ||||
234 | my $lang2 = &encode_language_tag($_[1]); | ||||
235 | # And encode_language_tag takes care of the whole | ||||
236 | # no-nyn==nn, i-hakka==zh-hakka, etc, things | ||||
237 | |||||
238 | # NB: (i-sil-...)? (i-sgn-...)? | ||||
239 | |||||
240 | return undef if !defined($lang1) and !defined($lang2); | ||||
241 | return 0 if !defined($lang1) or !defined($lang2); | ||||
242 | |||||
243 | my @l1_subtags = split('-', $lang1); | ||||
244 | my @l2_subtags = split('-', $lang2); | ||||
245 | my $similarity = 0; | ||||
246 | |||||
247 | while(@l1_subtags and @l2_subtags) { | ||||
248 | if(shift(@l1_subtags) eq shift(@l2_subtags)) { | ||||
249 | ++$similarity; | ||||
250 | } else { | ||||
251 | last; | ||||
252 | } | ||||
253 | } | ||||
254 | return $similarity; | ||||
255 | } | ||||
256 | |||||
257 | ########################################################################### | ||||
258 | |||||
259 | =item * the function is_dialect_of($lang1, $lang2) | ||||
260 | |||||
261 | Returns true iff language tag $lang1 represents a subform of | ||||
262 | language tag $lang2. | ||||
263 | |||||
264 | B<Get the order right! It doesn't work the other way around!> | ||||
265 | |||||
266 | is_dialect_of('en-US', 'en') is TRUE | ||||
267 | (American English IS a dialect of all-English) | ||||
268 | |||||
269 | is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE | ||||
270 | is_dialect_of('fr-CA-joual', 'fr') is TRUE | ||||
271 | (Joual is a dialect of (a dialect of) French) | ||||
272 | |||||
273 | is_dialect_of('en', 'en-US') is FALSE | ||||
274 | (all-English is a NOT dialect of American English) | ||||
275 | |||||
276 | is_dialect_of('fr', 'en-CA') is FALSE | ||||
277 | |||||
278 | is_dialect_of('en', 'en' ) is TRUE | ||||
279 | is_dialect_of('en-US', 'en-US') is TRUE | ||||
280 | (B<Note:> these are degenerate cases) | ||||
281 | |||||
282 | is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE | ||||
283 | (the x/i thing doesn't matter, nor does case) | ||||
284 | |||||
285 | is_dialect_of('nn', 'no') is TRUE | ||||
286 | (because 'nn' (New Norse) is aliased to 'no-nyn', | ||||
287 | as a special legacy case, and 'no-nyn' is a | ||||
288 | subform of 'no' (Norwegian)) | ||||
289 | |||||
290 | =cut | ||||
291 | |||||
292 | sub is_dialect_of { | ||||
293 | |||||
294 | my $lang1 = &encode_language_tag($_[0]); | ||||
295 | my $lang2 = &encode_language_tag($_[1]); | ||||
296 | |||||
297 | return undef if !defined($lang1) and !defined($lang2); | ||||
298 | return 0 if !defined($lang1) or !defined($lang2); | ||||
299 | |||||
300 | return 1 if $lang1 eq $lang2; | ||||
301 | return 0 if length($lang1) < length($lang2); | ||||
302 | |||||
303 | $lang1 .= '-'; | ||||
304 | $lang2 .= '-'; | ||||
305 | return | ||||
306 | (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; | ||||
307 | } | ||||
308 | |||||
309 | ########################################################################### | ||||
310 | |||||
311 | =item * the function super_languages($lang1) | ||||
312 | |||||
313 | Returns a list of language tags that are superordinate tags to $lang1 | ||||
314 | -- it gets this by removing subtags from the end of $lang1 until | ||||
315 | nothing (or just "i" or "x") is left. | ||||
316 | |||||
317 | super_languages("fr-CA-joual") is ("fr-CA", "fr") | ||||
318 | |||||
319 | super_languages("en-AU") is ("en") | ||||
320 | |||||
321 | super_languages("en") is empty-list, () | ||||
322 | |||||
323 | super_languages("i-cherokee") is empty-list, () | ||||
324 | ...not ("i"), which would be illegal as well as pointless. | ||||
325 | |||||
326 | If $lang1 is not a valid language tag, returns empty-list in | ||||
327 | a list context, undef in a scalar context. | ||||
328 | |||||
329 | A notable and rather unavoidable problem with this method: | ||||
330 | "x-mingo-tom" has an "x" because the whole tag isn't an | ||||
331 | IANA-registered tag -- but super_languages('x-mingo-tom') is | ||||
332 | ('x-mingo') -- which isn't really right, since 'i-mingo' is | ||||
333 | registered. But this module has no way of knowing that. (But note | ||||
334 | that same_language_tag('x-mingo', 'i-mingo') is TRUE.) | ||||
335 | |||||
336 | More importantly, you assume I<at your peril> that superordinates of | ||||
337 | $lang1 are mutually intelligible with $lang1. Consider this | ||||
338 | carefully. | ||||
339 | |||||
340 | =cut | ||||
341 | |||||
342 | sub super_languages { | ||||
343 | my $lang1 = $_[0]; | ||||
344 | return() unless defined($lang1) && &is_language_tag($lang1); | ||||
345 | |||||
346 | # a hack for those annoying new (2001) tags: | ||||
347 | $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards | ||||
348 | $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards | ||||
349 | $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way | ||||
350 | # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark | ||||
351 | |||||
352 | my @l1_subtags = split('-', $lang1); | ||||
353 | |||||
354 | ## Changes in the language tagging standards may have to be reflected here. | ||||
355 | |||||
356 | # NB: (i-sil-...)? | ||||
357 | |||||
358 | my @supers = (); | ||||
359 | foreach my $bit (@l1_subtags) { | ||||
360 | push @supers, | ||||
361 | scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; | ||||
362 | } | ||||
363 | pop @supers if @supers; | ||||
364 | shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; | ||||
365 | return reverse @supers; | ||||
366 | } | ||||
367 | |||||
368 | ########################################################################### | ||||
369 | |||||
370 | =item * the function locale2language_tag($locale_identifier) | ||||
371 | |||||
372 | This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") | ||||
373 | and maps it to a language tag. If it's not mappable (as with, | ||||
374 | notably, "C" and "POSIX"), this returns empty-list in a list context, | ||||
375 | or undef in a scalar context. | ||||
376 | |||||
377 | locale2language_tag("en") is "en" | ||||
378 | |||||
379 | locale2language_tag("en_US") is "en-US" | ||||
380 | |||||
381 | locale2language_tag("en_US.ISO8859-1") is "en-US" | ||||
382 | |||||
383 | locale2language_tag("C") is undef or () | ||||
384 | |||||
385 | locale2language_tag("POSIX") is undef or () | ||||
386 | |||||
387 | locale2language_tag("POSIX") is undef or () | ||||
388 | |||||
389 | I'm not totally sure that locale names map satisfactorily to language | ||||
390 | tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. | ||||
391 | |||||
392 | The output is untainted. If you don't know what tainting is, | ||||
393 | don't worry about it. | ||||
394 | |||||
395 | =cut | ||||
396 | |||||
397 | sub locale2language_tag { | ||||
398 | my $lang = | ||||
399 | $_[0] =~ m/(.+)/ # to make for an untainted result | ||||
400 | ? $1 : '' | ||||
401 | ; | ||||
402 | |||||
403 | return $lang if &is_language_tag($lang); # like "en" | ||||
404 | |||||
405 | $lang =~ tr<_><->; # "en_US" -> en-US | ||||
406 | $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US | ||||
407 | # it_IT.utf8@euro => it-IT | ||||
408 | |||||
409 | return $lang if &is_language_tag($lang); | ||||
410 | |||||
411 | return; | ||||
412 | } | ||||
413 | |||||
414 | ########################################################################### | ||||
415 | |||||
416 | =item * the function encode_language_tag($lang1) | ||||
417 | |||||
418 | This function, if given a language tag, returns an encoding of it such | ||||
419 | that: | ||||
420 | |||||
421 | * tags representing different languages never get the same encoding. | ||||
422 | |||||
423 | * tags representing the same language always get the same encoding. | ||||
424 | |||||
425 | * an encoding of a formally valid language tag always is a string | ||||
426 | value that is defined, has length, and is true if considered as a | ||||
427 | boolean. | ||||
428 | |||||
429 | Note that the encoding itself is B<not> a formally valid language tag. | ||||
430 | Note also that you cannot, currently, go from an encoding back to a | ||||
431 | language tag that it's an encoding of. | ||||
432 | |||||
433 | Note also that you B<must> consider the encoded value as atomic; i.e., | ||||
434 | you should not consider it as anything but an opaque, unanalysable | ||||
435 | string value. (The internals of the encoding method may change in | ||||
436 | future versions, as the language tagging standard changes over time.) | ||||
437 | |||||
438 | C<encode_language_tag> returns undef if given anything other than a | ||||
439 | formally valid language tag. | ||||
440 | |||||
441 | The reason C<encode_language_tag> exists is because different language | ||||
442 | tags may represent the same language; this is normally treatable with | ||||
443 | C<same_language_tag>, but consider this situation: | ||||
444 | |||||
445 | You have a data file that expresses greetings in different languages. | ||||
446 | Its format is "[language tag]=[how to say 'Hello']", like: | ||||
447 | |||||
448 | en-US=Hiho | ||||
449 | fr=Bonjour | ||||
450 | i-mingo=Hau' | ||||
451 | |||||
452 | And suppose you write a program that reads that file and then runs as | ||||
453 | a daemon, answering client requests that specify a language tag and | ||||
454 | then expect the string that says how to greet in that language. So an | ||||
455 | interaction looks like: | ||||
456 | |||||
457 | greeting-client asks: fr | ||||
458 | greeting-server answers: Bonjour | ||||
459 | |||||
460 | So far so good. But suppose the way you're implementing this is: | ||||
461 | |||||
462 | my %greetings; | ||||
463 | die unless open(IN, "<in.dat"); | ||||
464 | while(<IN>) { | ||||
465 | chomp; | ||||
466 | next unless /^([^=]+)=(.+)/s; | ||||
467 | my($lang, $expr) = ($1, $2); | ||||
468 | $greetings{$lang} = $expr; | ||||
469 | } | ||||
470 | close(IN); | ||||
471 | |||||
472 | at which point %greetings has the contents: | ||||
473 | |||||
474 | "en-US" => "Hiho" | ||||
475 | "fr" => "Bonjour" | ||||
476 | "i-mingo" => "Hau'" | ||||
477 | |||||
478 | And suppose then that you answer client requests for language $wanted | ||||
479 | by just looking up $greetings{$wanted}. | ||||
480 | |||||
481 | If the client asks for "fr", that will look up successfully in | ||||
482 | %greetings, to the value "Bonjour". And if the client asks for | ||||
483 | "i-mingo", that will look up successfully in %greetings, to the value | ||||
484 | "Hau'". | ||||
485 | |||||
486 | But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the | ||||
487 | lookup in %greetings fails. That's the Wrong Thing. | ||||
488 | |||||
489 | You could instead do lookups on $wanted with: | ||||
490 | |||||
491 | use I18N::LangTags qw(same_language_tag); | ||||
492 | my $response = ''; | ||||
493 | foreach my $l2 (keys %greetings) { | ||||
494 | if(same_language_tag($wanted, $l2)) { | ||||
495 | $response = $greetings{$l2}; | ||||
496 | last; | ||||
497 | } | ||||
498 | } | ||||
499 | |||||
500 | But that's rather inefficient. A better way to do it is to start your | ||||
501 | program with: | ||||
502 | |||||
503 | use I18N::LangTags qw(encode_language_tag); | ||||
504 | my %greetings; | ||||
505 | die unless open(IN, "<in.dat"); | ||||
506 | while(<IN>) { | ||||
507 | chomp; | ||||
508 | next unless /^([^=]+)=(.+)/s; | ||||
509 | my($lang, $expr) = ($1, $2); | ||||
510 | $greetings{ | ||||
511 | encode_language_tag($lang) | ||||
512 | } = $expr; | ||||
513 | } | ||||
514 | close(IN); | ||||
515 | |||||
516 | and then just answer client requests for language $wanted by just | ||||
517 | looking up | ||||
518 | |||||
519 | $greetings{encode_language_tag($wanted)} | ||||
520 | |||||
521 | And that does the Right Thing. | ||||
522 | |||||
523 | =cut | ||||
524 | |||||
525 | sub encode_language_tag { | ||||
526 | # Only similarity_language_tag() is allowed to analyse encodings! | ||||
527 | |||||
528 | ## Changes in the language tagging standards may have to be reflected here. | ||||
529 | |||||
530 | my($tag) = $_[0] || return undef; | ||||
531 | return undef unless &is_language_tag($tag); | ||||
532 | |||||
533 | # For the moment, these legacy variances are few enough that | ||||
534 | # we can just handle them here with regexps. | ||||
535 | $tag =~ s/^iw\b/he/i; # Hebrew | ||||
536 | $tag =~ s/^in\b/id/i; # Indonesian | ||||
537 | $tag =~ s/^cre\b/cr/i; # Cree | ||||
538 | $tag =~ s/^jw\b/jv/i; # Javanese | ||||
539 | $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger | ||||
540 | $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo | ||||
541 | $tag =~ s/^ji\b/yi/i; # Yiddish | ||||
542 | # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, | ||||
543 | # but maybe they're all so obscure I can ignore them. "Obscure" | ||||
544 | # meaning either that the language is obscure, and/or that the | ||||
545 | # XXX form was extant so briefly that it's unlikely it was ever | ||||
546 | # used. I hope. | ||||
547 | # | ||||
548 | # These go FROM the simplex to complex form, to get | ||||
549 | # similarity-comparison right. And that's okay, since | ||||
550 | # similarity_language_tag is the only thing that | ||||
551 | # analyzes our output. | ||||
552 | $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka | ||||
553 | $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal | ||||
554 | $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk | ||||
555 | |||||
556 | $tag =~ s/^[xiXI]-//s; | ||||
557 | # Just lop off any leading "x/i-" | ||||
558 | |||||
559 | return "~" . uc($tag); | ||||
560 | } | ||||
561 | |||||
562 | #-------------------------------------------------------------------------- | ||||
563 | |||||
564 | =item * the function alternate_language_tags($lang1) | ||||
565 | |||||
566 | This function, if given a language tag, returns all language tags that | ||||
567 | are alternate forms of this language tag. (I.e., tags which refer to | ||||
568 | the same language.) This is meant to handle legacy tags caused by | ||||
569 | the minor changes in language tag standards over the years; and | ||||
570 | the x-/i- alternation is also dealt with. | ||||
571 | |||||
572 | Note that this function does I<not> try to equate new (and never-used, | ||||
573 | and unusable) | ||||
574 | ISO639-2 three-letter tags to old (and still in use) ISO639-1 | ||||
575 | two-letter equivalents -- like "ara" -> "ar" -- because | ||||
576 | "ara" has I<never> been in use as an Internet language tag, | ||||
577 | and RFC 3066 stipulates that it never should be, since a shorter | ||||
578 | tag ("ar") exists. | ||||
579 | |||||
580 | Examples: | ||||
581 | |||||
582 | alternate_language_tags('no-bok') is ('nb') | ||||
583 | alternate_language_tags('nb') is ('no-bok') | ||||
584 | alternate_language_tags('he') is ('iw') | ||||
585 | alternate_language_tags('iw') is ('he') | ||||
586 | alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') | ||||
587 | alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') | ||||
588 | alternate_language_tags('en') is () | ||||
589 | alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') | ||||
590 | alternate_language_tags('x-klikitat') is ('i-klikitat') | ||||
591 | alternate_language_tags('i-klikitat') is ('x-klikitat') | ||||
592 | |||||
593 | This function returns empty-list if given anything other than a formally | ||||
594 | valid language tag. | ||||
595 | |||||
596 | =cut | ||||
597 | |||||
598 | 1 | 8µs | my %alt = qw( i x x i I X X I ); | ||
599 | sub alternate_language_tags { | ||||
600 | my $tag = $_[0]; | ||||
601 | return() unless &is_language_tag($tag); | ||||
602 | |||||
603 | my @em; # push 'em real goood! | ||||
604 | |||||
605 | # For the moment, these legacy variances are few enough that | ||||
606 | # we can just handle them here with regexps. | ||||
607 | |||||
608 | if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; | ||||
609 | } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; | ||||
610 | |||||
611 | } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; | ||||
612 | } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; | ||||
613 | |||||
614 | } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; | ||||
615 | } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; | ||||
616 | |||||
617 | } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; | ||||
618 | } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; | ||||
619 | |||||
620 | } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; | ||||
621 | } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; | ||||
622 | |||||
623 | } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; | ||||
624 | } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; | ||||
625 | |||||
626 | } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; | ||||
627 | } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; | ||||
628 | |||||
629 | } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; | ||||
630 | } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; | ||||
631 | } | ||||
632 | |||||
633 | push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; | ||||
634 | return @em; | ||||
635 | } | ||||
636 | |||||
637 | ########################################################################### | ||||
638 | |||||
639 | { | ||||
640 | # Init %Panic... | ||||
641 | |||||
642 | 2 | 38µs | my @panic = ( # MUST all be lowercase! | ||
643 | # Only large ("national") languages make it in this list. | ||||
644 | # If you, as a user, are so bizarre that the /only/ language | ||||
645 | # you claim to accept is Galician, then no, we won't do you | ||||
646 | # the favor of providing Catalan as a panic-fallback for | ||||
647 | # you. Because if I start trying to add "little languages" in | ||||
648 | # here, I'll just go crazy. | ||||
649 | |||||
650 | # Scandinavian lgs. All based on opinion and hearsay. | ||||
651 | 'sv' => [qw(nb no da nn)], | ||||
652 | 'da' => [qw(nb no sv nn)], # I guess | ||||
653 | [qw(no nn nb)], [qw(no nn nb sv da)], | ||||
654 | 'is' => [qw(da sv no nb nn)], | ||||
655 | 'fo' => [qw(da is no nb nn sv)], # I guess | ||||
656 | |||||
657 | # I think this is about the extent of tolerable intelligibility | ||||
658 | # among large modern Romance languages. | ||||
659 | 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French | ||||
660 | 'ca' => [qw(es pt it fr)], | ||||
661 | 'es' => [qw(ca it fr pt)], | ||||
662 | 'it' => [qw(es fr ca pt)], | ||||
663 | 'fr' => [qw(es it ca pt)], | ||||
664 | |||||
665 | # Also assume that speakers of the main Indian languages prefer | ||||
666 | # to read/hear Hindi over English | ||||
667 | [qw( | ||||
668 | as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur | ||||
669 | )] => 'hi', | ||||
670 | # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, | ||||
671 | # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, | ||||
672 | # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. | ||||
673 | 'hi' => [qw(bn pa as or)], | ||||
674 | # I welcome finer data for the other Indian languages. | ||||
675 | # E.g., what should Oriya's list be, besides just Hindi? | ||||
676 | |||||
677 | # And the panic languages for English is, of course, nil! | ||||
678 | |||||
679 | # My guesses at Slavic intelligibility: | ||||
680 | ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian | ||||
681 | 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat | ||||
682 | 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak | ||||
683 | |||||
684 | 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian | ||||
685 | |||||
686 | 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish | ||||
687 | |||||
688 | #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai | ||||
689 | |||||
690 | ); | ||||
691 | 1 | 500ns | my($k,$v); | ||
692 | 1 | 2µs | while(@panic) { | ||
693 | 21 | 30µs | ($k,$v) = splice(@panic,0,2); | ||
694 | 21 | 27µs | foreach my $k (ref($k) ? @$k : $k) { | ||
695 | 41 | 53µs | foreach my $v (ref($v) ? @$v : $v) { | ||
696 | 92 | 200µs | push @{$Panic{$k} ||= []}, $v unless $k eq $v; | ||
697 | } | ||||
698 | } | ||||
699 | } | ||||
700 | } | ||||
701 | |||||
702 | =item * the function @langs = panic_languages(@accept_languages) | ||||
703 | |||||
704 | This function takes a list of 0 or more language | ||||
705 | tags that constitute a given user's Accept-Language list, and | ||||
706 | returns a list of tags for I<other> (non-super) | ||||
707 | languages that are probably acceptable to the user, to be | ||||
708 | used I<if all else fails>. | ||||
709 | |||||
710 | For example, if a user accepts only 'ca' (Catalan) and | ||||
711 | 'es' (Spanish), and the documents/interfaces you have | ||||
712 | available are just in German, Italian, and Chinese, then | ||||
713 | the user will most likely want the Italian one (and not | ||||
714 | the Chinese or German one!), instead of getting | ||||
715 | nothing. So C<panic_languages('ca', 'es')> returns | ||||
716 | a list containing 'it' (Italian). | ||||
717 | |||||
718 | English ('en') is I<always> in the return list, but | ||||
719 | whether it's at the very end or not depends | ||||
720 | on the input languages. This function works by consulting | ||||
721 | an internal table that stipulates what common | ||||
722 | languages are "close" to each other. | ||||
723 | |||||
724 | A useful construct you might consider using is: | ||||
725 | |||||
726 | @fallbacks = super_languages(@accept_languages); | ||||
727 | push @fallbacks, panic_languages( | ||||
728 | @accept_languages, @fallbacks, | ||||
729 | ); | ||||
730 | |||||
731 | =cut | ||||
732 | |||||
733 | sub panic_languages { | ||||
734 | # When in panic or in doubt, run in circles, scream, and shout! | ||||
735 | my(@out, %seen); | ||||
736 | foreach my $t (@_) { | ||||
737 | next unless $t; | ||||
738 | next if $seen{$t}++; # so we don't return it or hit it again | ||||
739 | # push @out, super_languages($t); # nah, keep that separate | ||||
740 | push @out, @{ $Panic{lc $t} || next }; | ||||
741 | } | ||||
742 | return grep !$seen{$_}++, @out, 'en'; | ||||
743 | } | ||||
744 | |||||
745 | #--------------------------------------------------------------------------- | ||||
746 | #--------------------------------------------------------------------------- | ||||
747 | |||||
748 | =item * the function implicate_supers( ...languages... ) | ||||
749 | |||||
750 | This takes a list of strings (which are presumed to be language-tags; | ||||
751 | strings that aren't, are ignored); and after each one, this function | ||||
752 | inserts super-ordinate forms that don't already appear in the list. | ||||
753 | The original list, plus these insertions, is returned. | ||||
754 | |||||
755 | In other words, it takes this: | ||||
756 | |||||
757 | pt-br de-DE en-US fr pt-br-janeiro | ||||
758 | |||||
759 | and returns this: | ||||
760 | |||||
761 | pt-br pt de-DE de en-US en fr pt-br-janeiro | ||||
762 | |||||
763 | This function is most useful in the idiom | ||||
764 | |||||
765 | implicate_supers( I18N::LangTags::Detect::detect() ); | ||||
766 | |||||
767 | (See L<I18N::LangTags::Detect>.) | ||||
768 | |||||
769 | |||||
770 | =item * the function implicate_supers_strictly( ...languages... ) | ||||
771 | |||||
772 | This works like C<implicate_supers> except that the implicated | ||||
773 | forms are added to the end of the return list. | ||||
774 | |||||
775 | In other words, implicate_supers_strictly takes a list of strings | ||||
776 | (which are presumed to be language-tags; strings that aren't, are | ||||
777 | ignored) and after the whole given list, it inserts the super-ordinate forms | ||||
778 | of all given tags, minus any tags that already appear in the input list. | ||||
779 | |||||
780 | In other words, it takes this: | ||||
781 | |||||
782 | pt-br de-DE en-US fr pt-br-janeiro | ||||
783 | |||||
784 | and returns this: | ||||
785 | |||||
786 | pt-br de-DE en-US fr pt-br-janeiro pt de en | ||||
787 | |||||
788 | The reason this function has "_strictly" in its name is that when | ||||
789 | you're processing an Accept-Language list according to the RFCs, if | ||||
790 | you interpret the RFCs quite strictly, then you would use | ||||
791 | implicate_supers_strictly, but for normal use (i.e., common-sense use, | ||||
792 | as far as I'm concerned) you'd use implicate_supers. | ||||
793 | |||||
794 | =cut | ||||
795 | |||||
796 | sub implicate_supers { | ||||
797 | my @languages = grep is_language_tag($_), @_; | ||||
798 | my %seen_encoded; | ||||
799 | foreach my $lang (@languages) { | ||||
800 | $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 | ||||
801 | } | ||||
802 | |||||
803 | my(@output_languages); | ||||
804 | foreach my $lang (@languages) { | ||||
805 | push @output_languages, $lang; | ||||
806 | foreach my $s ( I18N::LangTags::super_languages($lang) ) { | ||||
807 | # Note that super_languages returns the longest first. | ||||
808 | last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; | ||||
809 | push @output_languages, $s; | ||||
810 | } | ||||
811 | } | ||||
812 | return uniq( @output_languages ); | ||||
813 | |||||
814 | } | ||||
815 | |||||
816 | sub implicate_supers_strictly { | ||||
817 | my @tags = grep is_language_tag($_), @_; | ||||
818 | return uniq( @_, map super_languages($_), @_ ); | ||||
819 | } | ||||
820 | |||||
- - | |||||
823 | ########################################################################### | ||||
824 | 1 | 43µs | 1; | ||
825 | __END__ |