#!/usr/bin/perl
# Script to create a collation file for use in
# International Components for Unicode (ICU)
#
# Keith Stribley
# Distributable under the same terms as either the ICU license or the CLDR.
#
# This collation implements Myanmar Spelling Book Order.
# Note: Pali order would be much easier to implement, but
# is no longer used.
# This collation is very inefficient because it uses
# a huge number of collating elements to get the right
# sort order.
# This is the only way that I can see of doing it within the
# limitations of the locale file format. However, if you know
# a better way please let me know.
#
# Installation:
# type
#   perl genMyCollateICU.pl > my.txt
# Copy my.txt into the source/data/coll directory of ICU, add my.txt to
# colfiles.mk in the same directory and build ICU

use strict;
use warnings;

# Myanmar sorting is done by clusters
# The sorting order is:
# 1. Base consonant
# 2. YRWH medials
# 3. killed consonant
# 4. vowel
# 5. tone
# Most of these are easy because the sorting order is the same as the
# storage order for 1,2 and 5. However, 4 is stored before 3 so all
# permutations of 3 and 4 need to be defined as collating elements
# and listed between 2 and 5.
#
# All code points below are kept as literal "\uXXXX" escape text (backslash
# included) because they are copied verbatim into the generated ICU rules.

my $ASAT   = "\\u103A";    # mark that "kills" a final consonant
my $VIRAMA = "\\u1039";    # mark used for stacked finals
my $NGA    = "\\u1004";    # its stacked form (kinzi) is written NGA+103A+1039

# Burmese consonants that may take U+103A
my @consonants = (
    "\\u1000",    # KA
    "\\u1001",    # KHA
    "\\u1002",    # GA
    "\\u1003",    # GHA
    "\\u1004",    # NGA
    "\\u1005",    # CA
    "\\u1006",    # CHA
    "\\u1007",    # JA
    "\\u1008",    # JHA
    "\\u1009",    # NYA
    "\\u100A",    # NNYA
    "\\u100B",    # TTA
    "\\u100C",    # TTHA
    "\\u100D",    # DDA
    "\\u100E",    # DDHA
    "\\u100F",    # NNA
    "\\u1010",    # TA
    "\\u1011",    # THA
    "\\u1012",    # DA
    "\\u1013",    # DHA
    "\\u1014",    # NA
    "\\u1015",    # PA
    "\\u1016",    # PHA
    "\\u1017",    # BA
    "\\u1018",    # BHA
    "\\u1019",    # MA
    "\\u101A",    # YA
    "\\u101B",    # RA
    "\\u101C",    # LA
    "\\u101D",    # WA
    "\\u101E",    # SA
    "\\u101F",    # HA
    "\\u1020",    # LLA
    "\\u1021",
);

# Medial combinations in collation order; the leading "" (no medial) is
# skipped when the rules are generated.
my @yrwh = (
    "",
    "\\u105E",    # mon medial na
    "\\u105F",    # mon medial ma
    "\\u103B",
    "\\u103C",
    "\\u1060",    # mon medial la
    "\\u103D",
    "\\u1082",    # shan medial wa
    "\\u103E",
    "\\u103B\\u103D",
    "\\u103C\\u103D",
    "\\u103B\\u103E",
    "\\u103C\\u103E",
    "\\u103D\\u103E",
    "\\u103B\\u103D\\u103E",
    "\\u103C\\u103D\\u103E",
);

# Full vowel order. Not referenced by the generator code in this file; kept
# as a record of the intended ordering.
my @vowels = (
    "",
    "\\u102C",                   # aa
    "\\u1083",                   # shan aa
    "\\u1072",                   # kayah oe
    "\\u102D",                   # i
    "\\u1071",                   # Geba Karen i
    "\\u102E",                   # ii
    "\\u1033",                   # mon ii
    "\\u1067",                   # w pwo eu
    "\\u1068",                   # w pwo ue
    "\\u102F",                   # u
    "\\u1073",                   # kayah u
    "\\u1062",                   # Sgaw eu
    "\\u1074",                   # kayah ee
    "\\u1030",                   # uu
    "\\u1056",                   # vocalic r
    "\\u1057",                   # vocalic rr
    "\\u1058",                   # vocalic l
    "\\u1059",                   # vocalic ll
    "\\u1031",                   # e
    "\\u1084",                   # shan e
    "\\u1035",                   # e above
    "\\u1085",                   # shan e above
    "\\u1032",                   # ai
    "\\u1031\\u102C",            # aw High tone
    "\\u1031\\u102C\\u103A",     # aw low tone
    "\\u1034",                   # mon o
    "\\u1036",                   # an
    "\\u102D\\u102F",            # o
    "\\u1063",                   # Sgaw Hathi
    "\\u1086",                   # shan final y
);

# Vowels that can be followed by a final consonant. The leading "" stands
# for "no vowel", i.e. the bare consonant + final.
my @vowelsTakeFinal = (
    "",
    "\\u102C",            # 1
    "\\u102D",            # 2
    "\\u102F",            # 4
    "\\u1031",            # 6 e.g. foreign words
    "\\u1031\\u102C",     # 8 High tone
    "\\u102D\\u102F",     # 13
);

# Consonants that get both a U+1039 and a U+103A final form. The trailing ""
# is a terminator and is never used as a consonant.
my @killed = (
    "\\u1000",
    "\\u1001",
    "\\u1002",
    "\\u1003",
    "\\u1004",
    "\\u1005",
    "\\u1006",
    "\\u1007",
    "\\u1009",
    "\\u100A",
    "\\u100B",
    "\\u100C",
    "\\u100D",
    "\\u100E",
    "\\u100F",
    "\\u1010",
    "\\u1011",
    "\\u1012",
    "\\u1013",
    "\\u1014",
    "\\u1015",
    "\\u1016",
    "\\u1017",
    "\\u1018",
    "\\u1019",
    "\\u101A",
    "\\u101B",
    "\\u101C",
    "\\u101E",
    "\\u101F",
    "",
);

# the following includes tones from several scripts, the order chosen here
# groups tones of the same number together
# (Not referenced by the generator code in this file; kept as a record of
# the intended ordering.)
my @tones = (
    "",
    "\\u1069",            # w-pwo-1
    "\\u1037",            # dot-below
    "\\u1087",            # shan-2
    "\\u108B",            # shan-council-2
    "\\u106A",            # w-pwo-2
    "\\u1062\\u103A",     # sgaw karen
    "\\u1088",            # shan-3
    "\\u108C",            # shan-council-3
    "\\u106B",            # w-pwo-3
    "\\u102C\\u103A",     # sgaw karen
    "\\u1038",            # visarga
    "\\u106C",            # w-pwo-4
    "\\u1037\\u1038",     # sgaw karen
    "\\u1089",            # shan-5
    "\\u106D",            # w-pwo-5
    "\\u1063\\u103A",     # sgaw karen
    "\\u108A",            # shan-6
    "\\u108D",            # shan-council-emphatic
    "\\u108F",            # palaung tone 6
    "\\u1064",            # sgaw karen
);

# ---------------------------------------------------------------------------
# Fixed text sections of the generated file
# ---------------------------------------------------------------------------

# start file
my $PROLOGUE = <<'EOT';
// Myanmar Collation for ICU
// Developed by www.ThanLwinSoft.org
// Distributable under the same terms as the ICU license.
// This file is generated by genMyCollateICU.pl - do not hand edit
my {
    Version { "1.0" }
    collations{
        standard{
            Sequence{
[normalization on]
EOT

# Tones are just secondary, tertiary in UCA, but this gives the wrong results
# when you have a following syllable e.g. consider
# 1000 102C
# 1000 102C 1000
# 1000 102C 1038
# Where should the Burmese tones be inserted?
# 108C is consistent with Shan which also uses 1038
# (see http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3277.pdf page 4)
# but it isn't very important if you only collate Burmese so long as the
# \u1037, \u1038 relative order is present
my $TONE_AND_VOWEL_RULES = <<'EOT';
"&\u108C"
"<\u1037"
"<\u1038"
"<\u1037\u1038"
/* Vowels */
"&\u102C"
"<<\u102B"
"&\u1032"
"<\u1031\u102C"
"<<\u1031\u102B"
"<\u1031\u102C\u103A"
"<<\u1031\u102B\u103A"
"&\u1034"
"<\u1036"
"<\u102D\u102F"
EOT

# It is debatable whether to make the \u102D\u1036 and \u102F\u1036 difference
# from the \u1019 equivalent primary or secondary
# primary gives a closer match with the Myanmar/English dictionary
# secondary gives a better match with the preface to the Myanmar spelling dictionary
# It is hard to produce a better match because you need to include 1037, 1038,
# which distorts the other rules, probably due to triggering unwanted expansions.
my $INDEPENDENT_VOWEL_RULES = <<'EOT';
/* Independent vowels */
"&\u1021\u102d<<<\u1023"
"&\u1021\u102e<<<\u1024"
"&\u1021\u102f<<<\u1025"
"&\u1021\u1030<<<\u1026=\u1025\u102e"
"&\u1021\u1031<<<\u1027<<<\u1028"
"&\u1021\u1031\u102c<<<\u1029"
"&\u1021\u1031\u102c\u103A<<<\u102A"
"&\u102D\u1019\u103A<\u102D\u1036<\u102D\u1019\u103A\u1037<\u102D\u1036\u1037"
"<\u102D\u1019\u103A\u1038<\u102D\u1036\u1038"
"&\u102F\u1019\u103A<\u102F\u1036<\u102F\u1019\u103A\u1037<\u102F\u1036\u1037"
"<\u102F\u1019\u103A\u1038<\u102F\u1036\u1038"
"&\u1021\u102F\u1036<<<\u1025\u102F\u1036"
/* Independent vowels with finals */
EOT

my $CONTRACTIONS_AND_EPILOGUE = <<'EOT';
/* Contractions */
"&\u1031\u102C\u1000\u103A\u1000\u103B=\u1031\u102C\u1000\u103A\u103B"/* (suffix of) man */
"&\u1014\u103A\u1014\u102F\u1015\u103A=\u1014\u103A\u102F\u1015\u103A" /* (suffix of) I */
/* Great Sa */
"&\u1031\u101E\u1039\u101E=\u1031\u103F"
"&\u102D\u101E\u1039\u101E=\u102D\u103F"
"&\u102F\u101E\u1039\u101E=\u102F\u103F"
"&\u1021\u102D\u101E\u1039\u101E<<<\u1023\u103F"
"&\u1021\u102F\u101E\u1039\u101E<<<\u1025\u103F"
"&\u101E\u1039\u101E=\u103F"
/* Symbols - collate as long form */
"&\u1014\u103E\u102D\u102F\u1000\u103A<<\u104C"
"&\u101B\u103D\u1031\u1037<<\u104D"
"&\u101C\u100A\u103A\u1038\u1000\u1031\u102C\u1004\u103A\u1038<<\u104E\u1004\u103A\u1038"
"&\u1021\u102D<<\u104F"
/* Short Forms may need to be added here */
"&\u101C\u1000\u103A\u101A\u102C=\u101C\u1000\u103A\u103B\u102C" /* right hand side */
"&\u101E\u1019\u102E=\u101E\u1039\u1019\u102E" /* daughter */
"&\u1011\u1019\u1004\u103A\u1038=\u1011\u1039\u1019\u1004\u103A\u1038" /* cooked rice */
"&\u101C\u1000\u103A\u1018\u1000\u103A=\u101C\u1039\u1018\u1000\u103A" /*tea*/
            }
            Version{"1.0"}
        }
    }
}
EOT

# ---------------------------------------------------------------------------
# Rule generators. Each returns a list of quoted rule strings, one per output
# line, without the trailing newline.
# ---------------------------------------------------------------------------

# Rules for every (vowel, final consonant) combination.
# The final takes precedence over the vowel, but occurs after it, hence they
# all need to be listed explicitly.
#
# For each consonant in @consonants and each entry of @vowelsTakeFinal
# (including the leading "" = no vowel):
#   * consonant listed in @killed: a primary rule for the U+1039 form followed
#     by a secondary rule for the U+103A form;
#   * any other consonant: a single primary rule for the U+103A form.
# Whenever the vowel contains U+102C, the same forms are repeated as
# secondary rules with U+102B substituted.
sub final_rules {
    # Hash lookup instead of walking @killed in parallel with @consonants,
    # so the result does not depend on the two lists being in the same order.
    my %has_stacked_form = map { $_ => 1 } grep { length } @killed;

    my @rules;
    for my $consonant (@consonants) {
        # kinzi keeps its U+103A in front of the U+1039
        my $stack_mark = $consonant eq $NGA ? $ASAT . $VIRAMA : $VIRAMA;

        for my $vowel (@vowelsTakeFinal) {
            my $alt_vowel = $vowel;
            my $has_alt   = ( $alt_vowel =~ s/\\u102C/\\u102B/ );

            if ( $has_stacked_form{$consonant} ) {
                push @rules,
                  qq{"<$vowel$consonant$stack_mark"},
                  qq{"<<$vowel$consonant$ASAT"};
                push @rules,
                  qq{"<<$alt_vowel$consonant$stack_mark"},
                  qq{"<<$alt_vowel$consonant$ASAT"}
                  if $has_alt;
            }
            else {
                push @rules, qq{"<$vowel$consonant$ASAT"};
                push @rules, qq{"<<$alt_vowel$consonant$ASAT"} if $has_alt;
            }
        }
    }
    return @rules;
}

# One primary rule per medial combination, in @yrwh order. The first entry of
# @yrwh ("", no medial) is skipped.
sub medial_rules {
    return map { qq{"<$_"} } @yrwh[ 1 .. $#yrwh ];
}

# combinations of finals with independent vowels / finals
#
# For every non-empty entry of @killed, each long spelling is tied
# (tertiary difference) to its independent-vowel spelling, once with the
# U+1039 form of the final and once with the U+103A form.
sub independent_vowel_final_rules {
    # [ long spelling, independent-vowel spelling ]
    my @spellings = (
        [ "\\u1021\\u102d",        "\\u1023" ],
        [ "\\u1021\\u102f",        "\\u1025" ],
        [ "\\u1021\\u1031",        "\\u1027" ],
        [ "\\u1021\\u1031\\u102C", "\\u1029" ],
        # I think the next one only occurs in the word ဩောင်း
        [ "\\u1021\\u1031\\u102C", "\\u1029\\u1031\\u102C" ],
    );

    my @rules;
    for my $final ( grep { length } @killed ) {
        for my $spelling (@spellings) {
            my ( $long, $short ) = @{$spelling};
            for my $mark ( $VIRAMA, $ASAT ) {
                push @rules, qq{"&$long$final$mark<<<$short$final$mark"};
            }
        }
    }
    return @rules;
}

# Writes the complete collation file to STDOUT.
sub main {
    print $PROLOGUE;

    # define collating elements
    print "/* Tones */\n";
    print $TONE_AND_VOWEL_RULES;

    print "/* Vowels with finals */\n";
    print "$_\n" for final_rules();

    print "/* Medials */\n";
    print "$_\n" for medial_rules();
    print "\n";

    print $INDEPENDENT_VOWEL_RULES;
    print "$_\n" for independent_vowel_final_rules();

    print $CONTRACTIONS_AND_EPILOGUE;
    return;
}

# Generate the file when run as a script; stay silent when loaded from
# another file (e.g. by a test).
main() unless caller;