ILIAS  release_5-4 Revision v5.4.26-12-gabc799a52e6
UtfNormalGenerate.php
Go to the documentation of this file.
1<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html
19
# This is a maintenance script that writes files into the working directory,
# so refuse to run under anything but the command-line SAPI.
if (php_sapi_name() !== 'cli') {
    die("Run me from the command line please.\n");
}

# Presumably supplies the helpers used below (codepointToUtf8(),
# hexSequenceToUtf8(), escapeSingleString()); none of them is defined here.
require_once 'include/Unicode/UtfNormalUtil.php';
35
# Build the NFC quick-check table: the UTF-8 encoding of every code point whose
# NFC_QC property is listed as "M" or "N" is mapped to that letter.
$in = fopen("DerivedNormalizationProps.txt", "rt");
if (!$in) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while (false !== ($line = fgets($in))) {
    $matches = array();
    # Accepts "XXXX ; NFC_QC ; V" and ranges "XXXX..YYYY ; NFC_QC ; V".
    # The dots are escaped so only a literal ".." separates a range.
    if (!preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches)) {
        continue;
    }
    $first = hexdec($matches[1]);
    # An empty second group means a single code point, not a range.
    $last = ($matches[2] !== '') ? hexdec($matches[2]) : $first;
    $value = $matches[4];
    for ($i = $first; $i <= $last; $i++) {
        $checkNFC[codepointToUtf8($i)] = $value;
    }
}
fclose($in);
60
# Collect the composition exclusions. Characters recorded here are later left
# out of the canonical composition table.
$in = fopen("CompositionExclusions.txt", "rt");
if (!$in) {
    print "Can't open CompositionExclusions.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit(-1);
}
$exclude = array();
while (false !== ($line = fgets($in))) {
    $matches = array();
    # Only lines that begin with a hexadecimal code point carry data;
    # anything else does not match and is skipped.
    if (preg_match('/^([0-9A-F]+)/i', $line, $matches)) {
        $exclude[codepointToUtf8(hexdec($matches[1]))] = true;
    }
}
fclose($in);
77
# Parse UnicodeData.txt into the combining-class, decomposition and
# composition tables. All tables are keyed by UTF-8 encoded strings.
$in = fopen("UnicodeData.txt", "rt");
if (!$in) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}

# Start from empty tables so the expansion and serialization steps further
# down always receive arrays, even if no line contributes an entry.
$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
# Counters reported in the summary line after the expansion passes.
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while (false !== ($line = fgets($in))) {
    # explode() replaces split(), which was removed in PHP 7.0.
    $columns = explode(';', $line);
    if (count($columns) < 6) {
        # Blank or truncated line: the fields read below are not present.
        continue;
    }
    $codepoint = $columns[0];
    $canonicalCombiningClass = intval($columns[3]);
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8(hexdec($codepoint));

    # Only non-zero combining classes are stored.
    if ($canonicalCombiningClass !== 0) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if ($decompositionMapping === '') {
        continue;
    }
    $matches = array();
    if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) {
        # Compatibility decomposition: strip the leading "<tag> " marker.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8($decompositionMapping);

    # The compatibility table receives every mapping, canonical ones included;
    # the canonical tables receive only the untagged mappings.
    $compatibilityDecomp[$source] = $dest;
    if ($canonical) {
        $canonicalDecomp[$source] = $dest;
        # Excluded characters decompose but are never produced by composition.
        if (empty($exclude[$source])) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose($in);
133
print "Recursively expanding canonical mappings...\n";
# A decomposition can contain characters that decompose further, so keep
# substituting until one complete pass leaves every entry unchanged.
# do/while guarantees the first pass runs without a pre-seeded counter.
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($canonicalDecomp as $source => $dest) {
        # Each multi-byte UTF-8 sequence is looked up by callbackCanonical().
        $newDest = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $dest
        );
        if ($newDest !== $dest) {
            $changed++;
            $canonicalDecomp[$source] = $newDest;
        }
    }
    $pass++;
} while ($changed > 0);
154
print "Recursively expanding compatibility mappings...\n";
# Repeat the substitution pass until nothing changes any more; do/while runs
# the first pass unconditionally, so no artificial non-zero seed is needed.
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($compatibilityDecomp as $source => $dest) {
        # Each multi-byte UTF-8 sequence is looked up by callbackCompat().
        $newDest = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $dest
        );
        if ($newDest !== $dest) {
            $changed++;
            $compatibilityDecomp[$source] = $newDest;
        }
    }
    $pass++;
} while ($changed > 0);

print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";
177
# Emit UtfNormalData.inc: a PHP file that rebuilds the combining-class,
# canonical composition/decomposition and NFC quick-check tables through
# unserialize().
$out = fopen("UtfNormalData.inc", "wt");
if ($out) {
    $serCombining = escapeSingleString(serialize($combiningClass));
    $serComp = escapeSingleString(serialize($canonicalComp));
    $serCanon = escapeSingleString(serialize($canonicalDecomp));
    $serCheckNFC = escapeSingleString(serialize($checkNFC));
    # The PHP open/close tags are assembled from two pieces, presumably so that
    # this script never contains a literal tag inside a string.
    $outdata = "<" . "?php
global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
\$utfCombiningClass = unserialize( '$serCombining' );
\$utfCanonicalComp = unserialize( '$serComp' );
\$utfCanonicalDecomp = unserialize( '$serCanon' );
\$utfCheckNFC = unserialize( '$serCheckNFC' );
?" . ">\n";
    $written = fputs($out, $outdata);
    fclose($out);
    # A failed or short write would leave a truncated include file behind;
    # report it instead of announcing success.
    if ($written !== strlen($outdata)) {
        print "Can't write file UtfNormalData.inc\n";
        exit(-1);
    }
    print "Wrote out UtfNormalData.inc\n";
} else {
    print "Can't create file UtfNormalData.inc\n";
    exit(-1);
}
203
204
# Emit UtfNormalDataK.inc: a PHP file that rebuilds the compatibility
# decomposition table through unserialize(). This is the last step of the
# script, so both outcomes end with exit().
$out = fopen("UtfNormalDataK.inc", "wt");
if ($out) {
    $serCompat = escapeSingleString(serialize($compatibilityDecomp));
    # The PHP open/close tags are assembled from two pieces, presumably so that
    # this script never contains a literal tag inside a string.
    $outdata = "<" . "?php
global \$utfCompatibilityDecomp;
\$utfCompatibilityDecomp = unserialize( '$serCompat' );
?" . ">\n";
    $written = fputs($out, $outdata);
    fclose($out);
    # A failed or short write would leave a truncated include file behind;
    # report it instead of announcing success.
    if ($written !== strlen($outdata)) {
        print "Can't write file UtfNormalDataK.inc\n";
        exit(-1);
    }
    print "Wrote out UtfNormalDataK.inc\n";
    exit(0);
} else {
    print "Can't create file UtfNormalDataK.inc\n";
    exit(-1);
}
225
226# ---------------
227
/**
 * preg_replace_callback() handler used while expanding canonical mappings.
 *
 * Reads the global $canonicalDecomp table.
 *
 * @param array $matches Match data; $matches[1] is the captured sequence.
 * @return string The table entry for the captured sequence, or the sequence
 *                itself when the table has no entry for it.
 */
function callbackCanonical($matches)
{
    global $canonicalDecomp;
    $sequence = $matches[1];
    return isset($canonicalDecomp[$sequence]) ? $canonicalDecomp[$sequence] : $sequence;
}
236
/**
 * preg_replace_callback() handler used while expanding compatibility mappings.
 *
 * Reads the global $compatibilityDecomp table. The table must be imported with
 * "global"; without it the lookup sees an undefined local variable and every
 * sequence is returned unchanged.
 *
 * @param array $matches Match data; $matches[1] is the captured sequence.
 * @return string The table entry for the captured sequence, or the sequence
 *                itself when the table has no entry for it.
 */
function callbackCompat($matches)
{
    global $compatibilityDecomp;
    $sequence = $matches[1];
    return isset($compatibilityDecomp[$sequence]) ? $compatibilityDecomp[$sequence] : $sequence;
}
if(! $in) $columns
Definition: Utf8Test.php:45
$canonicalDecomp
callbackCompat($matches)
$canonicalComp
callbackCanonical($matches)
if(! $in) $exclude
if(! $in) $compatibilityDecomp
$combiningClass
if(! $in) print
escapeSingleString($string)
Escape a string for inclusion in a PHP single-quoted string literal.
codepointToUtf8($codepoint)
Return UTF-8 sequence for a given Unicode code point.
hexSequenceToUtf8($sequence)
Take a series of space-separated hexadecimal numbers representing Unicode code points and return a UT...
$source
Definition: linkback.php:22
exit
Definition: backend.php:16
An exception for terminating execution or to throw for unit testing.
$i
Definition: disco.tpl.php:19
split($path)
Returns the 'dirname' and 'basename' for a path.
Definition: functions.php:279