ILIAS  release_5-3 Revision v5.3.23-19-g915713cf615
UtfNormalGenerate.php
Go to the documentation of this file.
1 <?php
2 # Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3 # http://www.mediawiki.org/
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 # http://www.gnu.org/copyleft/gpl.html
19 
# This generator is a maintenance script; stop immediately unless PHP is
# running under the command-line SAPI.
if (PHP_SAPI !== 'cli') {
    exit("Run me from the command line please.\n");
}
33 
34 require_once 'include/Unicode/UtfNormalUtil.php';
35 
# Build the NFC quick-check table from DerivedNormalizationProps.txt.
#
# Only lines that assign NFC_QC the value M or N are used. Such a line starts
# with either a single hex code point or a "FIRST..LAST" hex range, e.g.
#   0340..0341 ; NFC_QC; N
# For every code point covered, $checkNFC maps its UTF-8 encoding (as
# returned by codepointToUtf8()) to that M/N value.
$in = fopen("DerivedNormalizationProps.txt", "rt");
if (!$in) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    # Point at the file that is actually missing (the hint used to name
    # CompositionExclusions.txt).
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit(-1);
}
print "Initializing normalization quick check tables...\n";

# Start from an empty table so that later serialization never sees an
# undefined variable, even if no line matches.
$checkNFC = array();
while (false !== ($line = fgets($in))) {
    $matches = array();
    # The range separator is a literal "..", so both dots are escaped.
    if (!preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches)) {
        continue;
    }

    $firstCodepoint = hexdec($matches[1]);
    # Group 2 is the empty string when the line names a single code point.
    $lastCodepoint = ($matches[2] === '') ? $firstCodepoint : hexdec($matches[2]);
    $value = $matches[4];

    for ($i = $firstCodepoint; $i <= $lastCodepoint; $i++) {
        $checkNFC[codepointToUtf8($i)] = $value;
    }
}
fclose($in);
60 
# Read CompositionExclusions.txt. Every line that begins with a hex code
# point contributes one entry to $exclude, keyed by the UTF-8 encoding of
# that code point; lines that do not start with hex digits are skipped.
$in = fopen("CompositionExclusions.txt", "rt");
if ($in === false) {
    print "Can't open CompositionExclusions.txt for reading.\n"
        . "If necessary, fetch this file from the internet:\n"
        . "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit(-1);
}
for ($line = fgets($in); $line !== false; $line = fgets($in)) {
    if (!preg_match('/^([0-9A-F]+)/i', $line, $matches)) {
        continue;
    }
    $exclude[codepointToUtf8(hexdec($matches[1]))] = true;
}
fclose($in);
77 
# Read UnicodeData.txt and collect, keyed by the UTF-8 encoding of each
# character:
#   $combiningClass       non-zero canonical combining classes (field 3)
#   $compatibilityDecomp  every decomposition mapping (field 5), with any
#                         leading "<tag> " stripped
#   $canonicalDecomp      only the untagged (canonical) mappings
#   $canonicalComp        reverse of $canonicalDecomp, skipping characters
#                         listed in $exclude
$in = fopen("UnicodeData.txt", "rt");
if (!$in) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}

# Counters reported once all mappings have been expanded.
$total = 0;
$compat = 0;
$canon = 0;

# Initialise the result tables so later foreach/serialize calls always
# receive arrays, even when the input contains no matching records.
$combiningClass = array();
$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();

print "Reading character definitions...\n";
while (false !== ($line = fgets($in))) {
    # split() was removed in PHP 7; the delimiter is a plain ';', so
    # explode() produces the same columns.
    $columns = explode(';', $line);
    if (count($columns) < 6) {
        # Blank or truncated line: the fields read below are not present.
        continue;
    }

    $codepoint = $columns[0];
    $canonicalCombiningClass = intval($columns[3]);
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8(hexdec($codepoint));

    if ($canonicalCombiningClass !== 0) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if ($decompositionMapping === '') {
        continue;
    }

    $matches = array();
    if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) {
        # A leading <tag> marks a compatibility decomposition; keep only
        # the code point sequence that follows it.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8($decompositionMapping);

    # Canonical mappings are recorded in the compatibility table as well.
    $compatibilityDecomp[$source] = $dest;
    if ($canonical) {
        $canonicalDecomp[$source] = $dest;
        if (empty($exclude[$source])) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose($in);
133 
# A decomposition may itself contain characters that have decompositions.
# Re-run callbackCanonical over every mapping until one complete pass leaves
# all of them unchanged.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($canonicalDecomp as $char => $sequence) {
        # The pattern picks out each multi-byte UTF-8 sequence in the mapping.
        $expanded = preg_replace_callback('/([\xc0-\xff][\x80-\xbf]+)/', 'callbackCanonical', $sequence);
        if ($expanded !== $sequence) {
            $canonicalDecomp[$char] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ($changed > 0);
154 
# Same fixed-point expansion as for the canonical table, applied to the
# compatibility table through callbackCompat: repeat until a pass changes
# nothing.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($compatibilityDecomp as $char => $sequence) {
        # The pattern picks out each multi-byte UTF-8 sequence in the mapping.
        $expanded = preg_replace_callback('/([\xc0-\xff][\x80-\xbf]+)/', 'callbackCompat', $sequence);
        if ($expanded !== $sequence) {
            $compatibilityDecomp[$char] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ($changed > 0);

# Summary of the mappings counted while reading UnicodeData.txt.
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
177 
# Write UtfNormalData.inc: a PHP file that restores the combining-class,
# canonical composition, canonical decomposition and NFC quick-check tables
# by calling unserialize() on embedded single-quoted strings.
178 $out = fopen("UtfNormalData.inc", "wt");
179 if ($out) {
# Each table is serialized and then passed through escapeSingleString()
# (defined outside this listing) because it is placed between single
# quotes in the generated code below.
180  $serCombining = escapeSingleString(serialize($combiningClass));
181  $serComp = escapeSingleString(serialize($canonicalComp));
182  $serCanon = escapeSingleString(serialize($canonicalDecomp));
183  $serCheckNFC = escapeSingleString(serialize($checkNFC));
# The "<" . "?php" and "?" . ">" concatenations build the generated file's
# PHP open/close tags; \$ keeps the variable names literal in the output.
# NOTE(review): the listing numbers jump from 184 to 190 inside this string,
# so some lines of the generated header are presumably not visible here;
# confirm against the original file before editing the literal.
184  $outdata = "<" . "?php
190 global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
191 \$utfCombiningClass = unserialize( '$serCombining' );
192 \$utfCanonicalComp = unserialize( '$serComp' );
193 \$utfCanonicalDecomp = unserialize( '$serCanon' );
194 \$utfCheckNFC = unserialize( '$serCheckNFC' );
195 ?" . ">\n";
196  fputs($out, $outdata);
197  fclose($out);
198  print "Wrote out UtfNormalData.inc\n";
199 } else {
# The output file could not be opened for writing: report it and stop.
200  print "Can't create file UtfNormalData.inc\n";
201  exit(-1);
202 }
203 
204 
# Write UtfNormalDataK.inc: a PHP file that restores the compatibility
# decomposition table by calling unserialize() on an embedded string.
205 $out = fopen("UtfNormalDataK.inc", "wt");
206 if ($out) {
# The serialized table is escaped with escapeSingleString() (defined outside
# this listing) because it is placed between single quotes below.
207  $serCompat = escapeSingleString(serialize($compatibilityDecomp));
# NOTE(review): the listing numbers jump from 208 to 214 inside this string,
# so some lines of the generated header are presumably not visible here;
# confirm against the original file before editing the literal.
208  $outdata = "<" . "?php
214 global \$utfCompatibilityDecomp;
215 \$utfCompatibilityDecomp = unserialize( '$serCompat' );
216 ?" . ">\n";
217  fputs($out, $outdata);
218  fclose($out);
219  print "Wrote out UtfNormalDataK.inc\n";
# Last output file written: end the script here with success status.
220  exit(0);
221 } else {
# The output file could not be opened for writing: report it and stop.
222  print "Can't create file UtfNormalDataK.inc\n";
223  exit(-1);
224 }
225 
226 # ---------------
227 
/**
 * preg_replace_callback() handler used while expanding canonical mappings.
 *
 * Looks up the captured sequence in the global $canonicalDecomp table and
 * returns the stored decomposition; a sequence without an entry is returned
 * unchanged.
 *
 * @param array $matches Match array; index 1 holds the captured sequence.
 * @return string Replacement for the matched sequence.
 */
function callbackCanonical($matches)
{
    global $canonicalDecomp;

    $sequence = $matches[1];
    return isset($canonicalDecomp[$sequence]) ? $canonicalDecomp[$sequence] : $sequence;
}
236 
/**
 * preg_replace_callback() handler used while expanding compatibility
 * mappings.
 *
 * Looks up the captured sequence in the global $compatibilityDecomp table
 * and returns the stored decomposition; a sequence without an entry is
 * returned unchanged.
 *
 * @param array $matches Match array; index 1 holds the captured sequence.
 * @return string Replacement for the matched sequence.
 */
function callbackCompat($matches)
{
    global $compatibilityDecomp;

    $sequence = $matches[1];
    return isset($compatibilityDecomp[$sequence]) ? $compatibilityDecomp[$sequence] : $sequence;
}
if(! $in) $compatibilityDecomp
callbackCanonical($matches)
$combiningClass
$canonicalComp
if($format !==null) $name
Definition: metadata.php:146
$canonicalDecomp
if(! $in) $exclude
Create styles array
The data for the language used.
callbackCompat($matches)
escapeSingleString($string)
Escape a string for inclusion in a PHP single-quoted string literal.
$i
Definition: disco.tpl.php:19
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
$source
Definition: linkback.php:22
if(! $in) $columns
Definition: Utf8Test.php:45
hexSequenceToUtf8($sequence)
Take a series of space-separated hexadecimal numbers representing Unicode code points and return a UTF-8 string.