ILIAS  release_5-2 Revision v5.2.25-18-g3f80b828510
UtfNormalGenerate.php
Go to the documentation of this file.
1<?php
2# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3# http://www.mediawiki.org/
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License along
16# with this program; if not, write to the Free Software Foundation, Inc.,
17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18# http://www.gnu.org/copyleft/gpl.html
19
# This generator is a command-line tool: stop immediately, with a hint,
# when it is invoked through any other SAPI.
if( 'cli' != php_sapi_name() ) {
    exit( "Run me from the command line please.\n" );
}
33
34require_once 'include/Unicode/UtfNormalUtil.php';
35
# Build the NFC quick-check table from DerivedNormalizationProps.txt.
# Every line of the form "XXXX[..YYYY] ; NFC_QC ; M|N" contributes one entry
# per code point in the inclusive range XXXX..YYYY. Entries in $checkNFC are
# keyed by the UTF-8 sequence of the code point and hold the captured
# property value ("M" or "N").
$in = fopen( "DerivedNormalizationProps.txt", "rt" );
if( !$in ) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    # Point at the file that is actually missing; this hint used to print
    # the CompositionExclusions.txt URL.
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while( false !== ( $line = fgets( $in ) ) ) {
    $matches = array();
    # The two dots of the range separator are escaped so they only match
    # a literal "..", not any two characters.
    if( preg_match( '/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches ) ) {
        list( $junk, $first, $last, $prop, $value ) = $matches;
        #print "$first $last $prop $value\n";
        # A single code point (no "..YYYY" part) is a range of length one.
        if( !$last ) $last = $first;
        # Convert the range end once instead of on every loop iteration.
        $lastCodepoint = hexdec( $last );
        for( $i = hexdec( $first ); $i <= $lastCodepoint; $i++ ) {
            $char = codepointToUtf8( $i );
            $checkNFC[$char] = $value;
        }
    }
}
fclose( $in );
58
# Read CompositionExclusions.txt and remember every listed code point.
# $exclude maps the UTF-8 sequence of each excluded character to true; lines
# that do not start with a hexadecimal number are skipped.
$in = fopen( "CompositionExclusions.txt", "rt" );
if( !$in ) {
    echo "Can't open CompositionExclusions.txt for reading.\n",
        "If necessary, fetch this file from the internet:\n",
        "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit(-1);
}
$exclude = array();
for( $line = fgets( $in ); $line !== false; $line = fgets( $in ) ) {
    if( !preg_match( '/^([0-9A-F]+)/i', $line, $matches ) ) {
        continue;
    }
    $codepoint = $matches[1];
    $source = codepointToUtf8( hexdec( $codepoint ) );
    $exclude[$source] = true;
}
fclose( $in );
75
# Parse UnicodeData.txt (semicolon-separated fields) and collect, keyed by
# the UTF-8 sequence of each character:
#   $combiningClass      - non-zero canonical combining classes (field 3)
#   $canonicalDecomp     - decomposition mappings without a <tag> (field 5)
#   $compatibilityDecomp - all decomposition mappings, tagged or not
#   $canonicalComp       - decomposed sequence => character, for canonical
#                          mappings whose character is not in $exclude
$in = fopen( "UnicodeData.txt", "rt" );
if( !$in ) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}

# Start from empty tables and zeroed counters so the loop below never
# appends to, or increments, an undefined variable.
$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while( false !== ( $line = fgets( $in ) ) ) {
    # explode() replaces split(), which was deprecated in PHP 5.3 and removed
    # in PHP 7. The delimiter is a plain string, so no regex is needed.
    $columns = explode( ';', $line );
    if( count( $columns ) < 6 ) {
        # Blank or truncated line: the fields read below are not present.
        continue;
    }
    $codepoint = $columns[0];
    $name = $columns[1];
    $canonicalCombiningClass = $columns[3];
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8( hexdec( $codepoint ) );

    if( $canonicalCombiningClass != 0 ) {
        $combiningClass[$source] = intval( $canonicalCombiningClass );
    }

    if( $decompositionMapping === '' ) continue;
    if( preg_match( '/^<(.+)> (.*)$/', $decompositionMapping, $matches ) ) {
        # Compatibility decomposition: strip the leading <tag>
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8( $decompositionMapping );

    # Every mapping is recorded as a compatibility mapping; only untagged
    # ones are also recorded as canonical.
    $compatibilityDecomp[$source] = $dest;
    if( $canonical ) {
        $canonicalDecomp[$source] = $dest;
        if( empty( $exclude[$source] ) ) {
            $canonicalComp[$dest] = $source;
        }
    }
    #print "$codepoint | $canonicalCombiningClass | $decompositionMapping\n";
}
fclose( $in );
129
print "Recursively expanding canonical mappings...\n";
# Repeatedly substitute every multi-byte sequence inside each mapping with
# its own canonical mapping (via callbackCanonical) until a full pass changes
# nothing. $pass only numbers the progress output.
#
# $pass is initialised here and the loop is a do/while: testing an
# uninitialised $changed in a leading while() condition would skip the
# expansion entirely.
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach( $canonicalDecomp as $source => $dest ) {
        # The pattern picks out a lead byte (0xC0-0xFF) followed by one or
        # more continuation bytes (0x80-0xBF); single-byte characters are
        # left untouched.
        $newDest = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $dest );
        if( $newDest === $dest ) continue;
        $changed++;
        $canonicalDecomp[$source] = $newDest;
    }
    $pass++;
} while( $changed > 0 );
147
print "Recursively expanding compatibility mappings...\n";
# Keep rewriting the compatibility table until one complete pass leaves every
# mapping unchanged. Each multi-byte sequence in a mapping is handed to
# callbackCompat for substitution.
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach( $compatibilityDecomp as $source => $dest ) {
        $newDest = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $dest );
        if( $newDest !== $dest ) {
            $compatibilityDecomp[$source] = $newDest;
            $changed++;
        }
    }
    $pass++;
} while( $changed > 0 );
165
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

# Emit UtfNormalData.inc: a PHP file that rebuilds the combining-class,
# canonical composition, canonical decomposition and NFC quick-check tables
# by unserializing string literals. escapeSingleString() makes each
# serialized table safe to embed between single quotes.
$out = fopen( "UtfNormalData.inc", "wt" );
if( !$out ) {
    print "Can't create file UtfNormalData.inc\n";
    exit(-1);
}
$serCombining = escapeSingleString( serialize( $combiningClass ) );
$serComp = escapeSingleString( serialize( $canonicalComp ) );
$serCanon = escapeSingleString( serialize( $canonicalDecomp ) );
$serCheckNFC = escapeSingleString( serialize( $checkNFC ) );
# The open and close tags are split across two string pieces in the
# generated text.
$outdata = "<" . "?php
global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
\$utfCombiningClass = unserialize( '$serCombining' );
\$utfCanonicalComp = unserialize( '$serComp' );
\$utfCanonicalDecomp = unserialize( '$serCanon' );
\$utfCheckNFC = unserialize( '$serCheckNFC' );
?" . ">\n";
fwrite( $out, $outdata );
fclose( $out );
print "Wrote out UtfNormalData.inc\n";
193
194
# Emit UtfNormalDataK.inc: a PHP file that rebuilds the compatibility
# decomposition table by unserializing a string literal. The script ends
# here with status 0 on success, or -1 if the file cannot be created.
$out = fopen( "UtfNormalDataK.inc", "wt" );
if( !$out ) {
    print "Can't create file UtfNormalDataK.inc\n";
    exit(-1);
}
$serCompat = escapeSingleString( serialize( $compatibilityDecomp ) );
$outdata = "<" . "?php
global \$utfCompatibilityDecomp;
\$utfCompatibilityDecomp = unserialize( '$serCompat' );
?" . ">\n";
fwrite( $out, $outdata );
fclose( $out );
print "Wrote out UtfNormalDataK.inc\n";
exit(0);
215
216# ---------------
217
# preg_replace_callback() handler for the canonical expansion loop.
# $matches[1] is the captured byte sequence. If the global $canonicalDecomp
# table has a mapping for it, that mapping is returned; otherwise the
# sequence is returned unchanged.
function callbackCanonical( $matches ) {
    global $canonicalDecomp;
    $sequence = $matches[1];
    return isset( $canonicalDecomp[$sequence] )
        ? $canonicalDecomp[$sequence]
        : $sequence;
}
225
# preg_replace_callback() handler for the compatibility expansion loop.
# $matches[1] is the captured byte sequence. If the global
# $compatibilityDecomp table has a mapping for it, that mapping is returned;
# otherwise the sequence is returned unchanged.
function callbackCompat( $matches ) {
    # The table lives in the global scope. Without this import the isset()
    # below would test an empty local variable and never find a mapping.
    global $compatibilityDecomp;
    if( isset( $compatibilityDecomp[$matches[1]] ) ) {
        return $compatibilityDecomp[$matches[1]];
    }
    return $matches[1];
}
233
234?>
if(! $in) $columns
Definition: Utf8Test.php:45
$canonicalDecomp
if(! $in) print
callbackCanonical( $matches)
$canonicalComp
if(! $in) $exclude
callbackCompat( $matches)
if(! $in) $compatibilityDecomp
$combiningClass
escapeSingleString( $string)
Escape a string for inclusion in a PHP single-quoted string literal.
hexSequenceToUtf8( $sequence)
Take a series of space-separated hexadecimal numbers representing Unicode code points and return a UT...
codepointToUtf8( $codepoint)
Return UTF-8 sequence for a given Unicode code point.
An exception for terminating execution or to throw for unit testing.