-
Notifications
You must be signed in to change notification settings - Fork 5
/
load-unicode-data.tex
194 lines (194 loc) · 6.89 KB
/
load-unicode-data.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
% File load-unicode-data.tex
%
% Copyright 2015-2023 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of
% the LaTeX Project Public License (LPPL), either version 1.3c of
% this license or (at your option) any later version. The latest
% version of this license is in the file
% http://www.latex-project.org/lppl.txt.
%
% Issues with this file should be reported at
% https://github.com/latex3/unicode-data
%
% This file parses a number of data files provided by the Unicode Consortium
% and, when used with a Unicode-capable engine, sets up a range of TeX-related
% parameters based on the extracted information.
%
% From the file UnicodeData.txt the following properties are set:
% - \catcode 11 for all letters (Unicode class "L")
% - \catcode 11 for all combining marks (Unicode class "M")
% - \sfcode 999 for all code points of class "Lu" (upper case letters)
% - \lccode for all of class "Ll" (lower case letters) to the code point
% itself, and \uccode to the upper case mapping (or if not given
% to the code point itself)
% - \uccode for all of class "Lu" (upper case letters) to the code point
% itself, and \lccode to the lower case mapping (or if not given
% to the code point itself)
% - \lccode and \uccode for all of class "Lt" (title case letters) to the
% lower and upper case mappings (or if not given to the code point itself)
% - \lccode and \uccode for all other letter code points are set to
% the code point itself
% - \lccode and/or \uccode for non-letter code points for which an upper
% or lower case mapping is given
% - \sfcode 0 (ignored) for code points of Unicode classes "Pe" (closing
% punctuation marks) and "Pf" (final quotation marks)
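% - \Umathcode as math type 7 (var) for all letters and combining marks that
% have their own line in the data file (code points covered only by a
% First/Last range do not get a \Umathcode)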
%
% =============================================================================
%
% Only Unicode engines can use this data. At present that means XeTeX and
% LuaTeX, both of which define \Umathcode: if it is missing, stop reading
% this file.
\ifx\Umathcode\undefined \expandafter\endinput \fi
% The code below also uses e-TeX extensions such as \unless, so stop reading
% in the same way if they are not available.
\ifx\eTeXversion\undefined \expandafter\endinput \fi
% This file can be loaded in IniTeX mode so the category codes of |{|, |}| and
% |#| may not be correct. Everything is done in a group so that only the
% settings we want to propagate are made available generally.
\begingroup
  % Category codes used by the rest of this file: they may not have been
  % set yet when running under IniTeX.
  \catcode`\^=7 %
  \catcode`\#=6 %
  \catcode`\}=2 %
  \catcode`\{=1 %
  % Character 10 (line feed) starts a new line in \message output; then
  % write some basic information to the log.
  \newlinechar=10 %
  \message{^^J}%
  \message{load-unicode-data.tex v1.17 (2023-09-18)^^J}%
  \message{Reading Unicode data^^J}%
% The first stage of parsing is dealing with the fact that there are lots of
% data items separated by |;|. Of those, only a few are needed so they are
% picked out and everything else is dropped. There is one complication: there
% are a few cases in the data file of ranges which are marked by the descriptor
% |First| and a matching |Last|. A separate routine is used to handle these
% cases.
% Entry point for one data line. The first nine |;|-separated fields are
% taken as arguments; any further fields stay in the input for the later
% steps. Passed on are the code point (#1), the class (#3) and the second
% field (#2), the last followed by the marker | First>| and |\relax| so that
% the next step can tell whether the line itself contained | First>|.
\def\parseunicodedataI#1;#2;#3;#4;#5;#6;#7;#8;#9;{%
\parseunicodedataII#1;#3;#2 First>\relax
}%
% #1 is the code point and #2 the class. #4 is empty when the only | First>|
% found is the marker appended by |\parseunicodedataI|, which means a normal
% line; if the line carried its own | First>| then #4 is not empty and the
% line starts a range. In both cases the code point and class are put back
% in front of the fields still waiting in the input.
\def\parseunicodedataII#1;#2;#3 First>#4\relax{%
\ifx\relax#4\relax
\expandafter\parseunicodedataIII
\else
\expandafter\parseunicodedataVII
\fi
#1;#2;%
}%
% Normal line: #1 is the code point and #2 the class. Of the fields that
% follow, #6 and #7 are handed on (they are used as the upper and lower case
% mappings respectively); #3 to #5 and everything in #8 up to the closing
% |\relax| are dropped.
\def\parseunicodedataIII#1;#2;#3;#4;#5;#6;#7;#8\relax{%
\parseunicodedataIV{#1}{#2}{#6}{#7}%
}%
% At this stage we have a `normal' data line with four pieces of information:
% the code point, the Unicode class and the (possibly empty) upper and lower
% case mappings. A few utility macros are defined, then we branch based on the
% Unicode class. Notice that for all letter-like code points we first set the
% |\lccode| and |\uccode| values to the code point itself then test for the
% classes where a different setting might be appropriate. For non-letters
% there is a check to see if any mappings are available, and also for trailing
% punctuation to set the appropriate |\sfcode|.
% The class names are stored in macros so that they can be compared using
% |\ifx| with a macro holding the class read from a data line.
\def\Ll{Ll}%
\def\Lt{Lt}%
\def\Lu{Lu}%
\def\Pe{Pe}%
\def\Pf{Pf}%
% Leaves just the first token of the material in front of |\relax|. Callers
% add a |?| after the class so that there is always at least one token.
\def\firsttoken#1#2\relax{#1}%
\def\parseunicodedataIV#1#2#3#4{%
  % #1 = code point, #2 = Unicode class, #3 = upper case mapping,
  % #4 = lower case mapping. The code point and mappings are hex digits
  % without the leading quote mark, and either mapping may be empty.
  % The class is stored in a macro for the \ifx comparisons below.
  \def\temp{#2}%
  % The number read by \ifnum is 0, 01 or 011: it is positive when the
  % class starts with L or with M.
  \ifnum 0%
      \if L\firsttoken#2?\relax 1\fi
      \if M\firsttoken#2?\relax 1\fi
      >0 %
    % Letter-like code point: start from the settings shared by all of them.
    \parseunicodedataV{"#1}%
    % All letters in math mode should be variables.
    \global\Umathcode"#1="7"01"#1 %
    % Cased letters: replace the defaults where a mapping is given.
    \ifx\temp\Ll
      \parseunicodedataVI\uccode{#1}{#3}%
    \else
      \ifx\temp\Lt
        \parseunicodedataVI\uccode{#1}{#3}%
        \parseunicodedataVI\lccode{#1}{#4}%
      \else
        \ifx\temp\Lu
          \parseunicodedataVI\lccode{#1}{#4}%
          \global\sfcode"#1=999 %
        \fi
      \fi
    \fi
  \else
    % Not letter-like. Closing punctuation and final quotation marks are
    % ignored for the space factor.
    \ifx\temp\Pe
      \global\sfcode"#1=0 %
    \else
      \ifx\temp\Pf
        \global\sfcode"#1=0 %
      \fi
    \fi
    % Case mappings are applied only when present in the data.
    \parseunicodedataVI\uccode{#1}{#3}%
    \parseunicodedataVI\lccode{#1}{#4}%
  \fi
}%
% Baseline settings for every letter-like code point: category code 11 and
% both case codes pointing at the code point itself. Cased letters may have
% the case codes changed afterwards, but doing this first means that no
% letter-like code point is left without them.
% #1 must be a complete number: either a hex literal including the leading
% quote mark or a count register followed by a space.
\def\parseunicodedataV#1{%
  \global\uccode#1=#1 %
  \global\lccode#1=#1 %
  \global\catcode#1=11 %
}%
% An auxiliary to deal with the fact that some cased letters don't actually
% have a case mapping available.
% #1 is the code table to set (callers pass |\lccode| or |\uccode|), #2 the
% code point and #3 the mapping, both as hex digits without the leading |"|.
% Nothing is done when #3 is empty. The |\else| form matters: when #3 is not
% empty, |\ifx| compares |\relax| with its first token and the remainder of
% #3 (plus the second |\relax|) lands in the branch that is skipped.
\def\parseunicodedataVI#1#2#3{%
\ifx\relax#3\relax
\else
\global#1"#2="#3 %
\fi
}%
% For lines that were the |First>| of a range, read the next line of the
% data source to obtain the end of the range. Lines for letters then trigger
% a loop over the entire range. These are always non-cased letters.
% #1 is the first code point of the range and #2 its class; #3 (the rest of
% the |First>| line) is dropped. The newly read line is placed in front of
% them so that the next step sees: last line, |\relax|, first code point,
% class, |\relax|.
\def\parseunicodedataVII#1;#2;#3\relax{%
\read0 to \unicodedataline
\expandafter\parseunicodedataXII\unicodedataline\relax#1;#2\relax
}%
\def\parseunicodedataXII#1;#2\relax#3;#4\relax{%
  % #1 = last code point of the range (first field of the line just read),
  % #2 = rest of that line (unused), #3 = first code point of the range,
  % #4 = class taken from the line that opened the range.
  % Only ranges whose class starts with L are processed.
  \if L\firsttoken#4?\relax
    % The group keeps \count0, and the \body set by \loop, away from the
    % loop that is reading the file.
    \begingroup
      \count0="#3 %
      \loop
        % Continue while the counter has not gone past the last code point.
        \ifnum\count0<\numexpr"#1+1\relax
          \parseunicodedataV{\count0 }%
          \advance\count0 1 %
      \repeat
    \endgroup
  \fi
}%
% From plain: may not be defined (yet).
% |\loop| saves everything up to |\repeat| as |\body|. The body has to finish
% inside an open conditional: |\iterate| supplies the |\else| and |\fi| for
% it, going round again when the test is true and stopping otherwise.
\def\loop#1\repeat{\def\body{#1}\iterate}%
\def\iterate{%
\body
\let\next\iterate
\else
\let\next\relax
\fi
\next
}%
% With this meaning |\repeat| is counted as a |\fi| whenever TeX skips over
% a |\loop| ... |\repeat| construct inside a conditional.
\let\repeat\fi
% There is no version data in |UnicodeData.txt|: log that it is being used with
% a hard-coded date (the modification date from ftp.unicode.org). This obviously
% needs to be updated when a new download takes place!
% |\string#| writes a literal hash sign at the start of each message.
\message{\string# UnicodeData-15.1.0.txt^^J}%
\message{\string# Modified 2023-09-18 08:45:00 GMT [JAW]^^J}%
% Actually loading the file requires an input stream, done directly.
% There is a blank line at the end of the data source so there is a check
% here for a |\par|.
\def\storedpar{\par}%
\openin0=UnicodeData.txt %
% If the file could not be opened then |\ifeof| is true straight away and the
% loop below does nothing at all. Say so in the log rather than silently
% leaving every code untouched.
\ifeof0 %
  \message{Warning: UnicodeData.txt could not be opened: no Unicode data loaded^^J}%
\fi
% Read line by line until the end of the file, passing every line apart from
% a blank one to the parser. The line is expanded first so that the parser
% sees its contents, with |\relax| marking the end.
\loop\unless\ifeof0 %
  \read0 to \unicodedataline
  \unless\ifx\unicodedataline\storedpar
    \expandafter\parseunicodedataI\unicodedataline\relax
  \fi
\repeat
\closein0 %
% Close the group opened before the category codes were set: only the
% global assignments made while parsing survive.
\endgroup