diff --git a/lib/LaTeXML/Common/Font.pm b/lib/LaTeXML/Common/Font.pm index f9f907287..76191349f 100644 --- a/lib/LaTeXML/Common/Font.pm +++ b/lib/LaTeXML/Common/Font.pm @@ -60,44 +60,53 @@ my $FLAG_EMPH = 0x10; # NOTE: This probably doesn't really belong in here... my %font_family = ( - cmr => { family => 'serif' }, cmss => { family => 'sansserif' }, - cmtt => { family => 'typewriter' }, cmvtt => { family => 'typewriter' }, - cmt => { family => 'serif' }, # for cmti "text italic" - cmfib => { family => 'serif' }, cmfr => { family => 'serif' }, - cmdh => { family => 'serif' }, cm => { family => 'serif' }, - ptm => { family => 'serif' }, ppl => { family => 'serif' }, - pnc => { family => 'serif' }, pbk => { family => 'serif' }, - phv => { family => 'sansserif' }, pag => { family => 'serif' }, - pcr => { family => 'typewriter' }, pzc => { family => 'script' }, - put => { family => 'serif' }, bch => { family => 'serif' }, - psy => { family => 'symbol' }, pzd => { family => 'dingbats' }, - ccr => { family => 'serif' }, ccy => { family => 'symbol' }, - cmbr => { family => 'sansserif' }, cmtl => { family => 'typewriter' }, - cmbrs => { family => 'symbol' }, ul9 => { family => 'typewriter' }, - txr => { family => 'serif' }, txss => { family => 'sansserif' }, - txtt => { family => 'typewriter' }, txms => { family => 'symbol' }, - txsya => { family => 'symbol' }, txsyb => { family => 'symbol' }, - pxr => { family => 'serif' }, pxms => { family => 'symbol' }, - pxsya => { family => 'symbol' }, pxsyb => { family => 'symbol' }, - futs => { family => 'serif' }, - uaq => { family => 'serif' }, ugq => { family => 'sansserif' }, - eur => { family => 'serif' }, eus => { family => 'script' }, - euf => { family => 'fraktur' }, euex => { family => 'symbol' }, + cmr => { family => 'serif' }, + cmss => { family => 'sansserif' }, + cmssq => { family => 'sansserif' }, # quote style? + cmssqi => { family => 'sansserif', shape => 'italic' }, # quote style? + cmtt => { family => 'typewriter' }, cmvtt => { family => 'typewriter' }, + cmt => { family => 'serif' }, # for cmti "text italic" + cmfib => { family => 'serif' }, + cmfr => { family => 'serif' }, + cm => { family => 'serif' }, + cmdh => { family => 'serif' }, + cmr => { family => 'serif' }, + cmdunh => { family => 'serif' }, # like cmr10 but with tall body heights + cmu => { family => 'serif' }, # unslanted italic ?? + ptm => { family => 'serif' }, ppl => { family => 'serif' }, + pnc => { family => 'serif' }, pbk => { family => 'serif' }, + phv => { family => 'sansserif' }, pag => { family => 'serif' }, + pcr => { family => 'typewriter' }, pzc => { family => 'script' }, + put => { family => 'serif' }, bch => { family => 'serif' }, + psy => { family => 'symbol' }, pzd => { family => 'dingbats' }, + ccr => { family => 'serif' }, ccy => { family => 'symbol' }, + cmbr => { family => 'sansserif' }, cmtl => { family => 'typewriter' }, + cmbrs => { family => 'symbol' }, ul9 => { family => 'typewriter' }, + txr => { family => 'serif' }, txss => { family => 'sansserif' }, + txtt => { family => 'typewriter' }, txms => { family => 'symbol' }, + txsya => { family => 'symbol' }, txsyb => { family => 'symbol' }, + pxr => { family => 'serif' }, pxms => { family => 'symbol' }, + pxsya => { family => 'symbol' }, pxsyb => { family => 'symbol' }, + futs => { family => 'serif' }, + uaq => { family => 'serif' }, ugq => { family => 'sansserif' }, + eur => { family => 'serif' }, eus => { family => 'script' }, + euf => { family => 'fraktur' }, euex => { family => 'symbol' }, # The following are actually math fonts. - ms => { family => 'symbol' }, - ccm => { family => 'serif', shape => 'italic' }, - cmm => { family => 'italic', encoding => 'OML' }, - cmex => { family => 'symbol', encoding => 'OMX' }, # Not really symbol, but... - cmsy => { family => 'symbol', encoding => 'OMS' }, - ccitt => { family => 'typewriter', shape => 'italic' }, - cmbrm => { family => 'sansserif', shape => 'italic' }, - futm => { family => 'serif', shape => 'italic' }, - futmi => { family => 'serif', shape => 'italic' }, - txmi => { family => 'serif', shape => 'italic' }, - pxmi => { family => 'serif', shape => 'italic' }, - bbm => { family => 'blackboard' }, - bbold => { family => 'blackboard' }, - bbmss => { family => 'blackboard' }, + ms => { family => 'symbol' }, + ccm => { family => 'serif', shape => 'italic' }, + cmm => { family => 'math', shape => 'italic', encoding => 'OML' }, + cmex => { family => 'symbol', encoding => 'OMX' }, # Not really symbol, but... + cmsy => { family => 'symbol', encoding => 'OMS' }, + ccitt => { family => 'typewriter', shape => 'italic' }, + cmsltt => { family => 'typewriter', shape => 'slanted' }, + cmbrm => { family => 'sansserif', shape => 'italic' }, + futm => { family => 'serif', shape => 'italic' }, + futmi => { family => 'serif', shape => 'italic' }, + txmi => { family => 'serif', shape => 'italic' }, + pxmi => { family => 'serif', shape => 'italic' }, + bbm => { family => 'blackboard' }, + bbold => { family => 'blackboard' }, + bbmss => { family => 'blackboard' }, # some ams fonts cmmib => { family => 'italic', series => 'bold' }, cmbsy => { family => 'symbol', series => 'bold' }, @@ -177,16 +186,17 @@ sub decodeFontname { if (my $ffam = lookupFontFamily($fam)) { map { $props{$_} = $$ffam{$_} } keys %$ffam; } if (my $fser = lookupFontSeries($ser)) { map { $props{$_} = $$fser{$_} } keys %$fser; } if (my $fsh = lookupFontShape($shp)) { map { $props{$_} = $$fsh{$_} } keys %$fsh; } - $size = 1 unless $size; # Yes, also if 0, "" (from regexp) - $size = $at if defined $at; - $size *= $scaled if defined $scaled; + $size = 1 unless $size; # Yes, also if 0, "" (from regexp) + $size = $at if defined $at; + $size = $size * $scaled if defined $scaled; + $props{name} = $name; $props{size} = $size; # Experimental Hack !?!?!? $props{encoding} = 'OT1' unless defined $props{encoding}; - $props{at} = $at . "pt" if defined $at; return %props; } else { - return; } } + Info('unrecognized', 'font', undef, "Unrecognized fontname '$name'"); + return (family => $name, size => DEFSIZE()); } } sub lookupTeXFont { my ($fontname, $seriescode, $shapecode) = @_; @@ -266,6 +276,7 @@ sub stringify { no warnings 'recursion'; my ($self) = @_; my ($fam, $ser, $shp, $siz, $col, $bkg, $opa, $enc, $lang, $mstyle, $flags) = @$self; + # !!!!! $fam = 'serif' if $fam && ($fam eq 'math'); return 'Font[' . join(',', map { Stringify($_) } grep { $_ } (isDiff($fam, $DEFFAMILY) ? ($fam) : ()), @@ -280,6 +291,16 @@ sub stringify { ) . ']'; } +# Return a Fontinfo-like hash +# Eventually a more integrated representation of Fonts that accommodates +# both low-level TeX-like commands, and higher-level CSS-like ones. +sub asFontinfo { + my ($self) = @_; + my ($fam, $ser, $shp, $siz, $col, $bkg, $opa, $enc, $lang, $mstyle, $flags) = @$self; + return { family => $fam, series => $ser, shape => $shp, size => $siz, + color => $col, background => $bkg, opacity => $opa, + encoding => $enc || 'OT1', language => $lang, mathstyle => $mstyle }; } + sub equals { my ($self, $other) = @_; return (defined $other) && ((ref $self) eq (ref $other)) @@ -328,6 +349,7 @@ sub relativeTo { my ($self, $other) = @_; my ($fam, $ser, $shp, $siz, $col, $bkg, $opa, $enc, $lang, $mstyle, $flags) = @$self; my ($ofam, $oser, $oshp, $osiz, $ocol, $obkg, $oopa, $oenc, $olang, $omstyle, $oflags) = @$other; + # !!!! $fam = 'serif' if $fam && ($fam eq 'math'); $ofam = 'serif' if $ofam && ($ofam eq 'math'); ## my $emph = 0; @@ -358,6 +380,9 @@ sub relativeTo { (isDiff($opa, $oopa) ? (opacity => { value => $opa, properties => { opacity => $opa } }) : ()), + (isDiff($enc, $oenc) + ? (encoding => { value => $enc, properties => { encoding => $enc } }) + : ()), (isDiff($lang, $olang) ? ('xml:lang' => { value => $lang, properties => { language => $lang } }) : ()), diff --git a/lib/LaTeXML/Core/Definition/CharDef.pm b/lib/LaTeXML/Core/Definition/CharDef.pm index 69cd4de1e..d992ba8ec 100644 --- a/lib/LaTeXML/Core/Definition/CharDef.pm +++ b/lib/LaTeXML/Core/Definition/CharDef.pm @@ -47,13 +47,17 @@ sub invoke { my $mathglyph = $$self{mathglyph}; # A dilemma: If the \chardef were in a style file, you're prefer to revert to the $cs # but if defined in the document source, better to use \char ###\relax, so it still "works" - if (defined $mathglyph) { # Must be a math char + my $src = $$self{locator} && $$self{locator}->toString; + my $local = $src && $src !~ /\.(?:sty|ltxml|ltxmlc)/; # Dumps currently have undefined src! + if (defined $mathglyph) { # Must be a math char return Box($mathglyph, undef, undef, - Tokens(T_CS('\mathchar'), $value->revert, T_CS('\relax')), + ($local ? Tokens(T_CS('\mathchar'), $value->revert, T_CS('\relax')) : $$self{cs}), role => $$self{role}); } - else { # else text; but note defered font/encoding till digestion! - return Box(LaTeXML::Package::FontDecode($value->valueOf), undef, undef, - Tokens(T_CS('\char'), $value->revert, T_CS('\relax'))); } } + else { # else text; but note defered font/encoding till digestion! + my ($char, %props) = LaTeXML::Package::FontDecode($value->valueOf); + return Box($char, undef, undef, + ($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs}), + %props); } } sub equals { my ($self, $other) = @_; diff --git a/lib/LaTeXML/Core/State.pm b/lib/LaTeXML/Core/State.pm index 7dbf31c6d..6fb15c8de 100644 --- a/lib/LaTeXML/Core/State.pm +++ b/lib/LaTeXML/Core/State.pm @@ -124,13 +124,12 @@ sub new { $$self{delcode} = {}; $$self{tracing_definitions} = {}; # Initializations that INITEX would have set. - $$self{mathcode}{'.'} = [0]; for (my $c = ord('0') ; $c <= ord('9') ; $c++) { - $$self{mathcode}{ chr($c) } = [0x7000]; } + $$self{mathcode}{ chr($c) } = [0x7000 + $c]; } for (my $c = ord('a') ; $c <= ord('z') ; $c++) { my $C = $c + ord('A') - ord('a'); - $$self{mathcode}{ chr($c) } = [0x7100]; - $$self{mathcode}{ chr($C) } = [0x7100]; + $$self{mathcode}{ chr($c) } = [0x7100 + $c]; + $$self{mathcode}{ chr($C) } = [0x7100 + $C]; $$self{uccode}{ chr($c) } = [$C]; $$self{lccode}{ chr($C) } = [$c]; $$self{sfcode}{ chr($C) } = [999]; } diff --git a/lib/LaTeXML/Engine/LaTeX.pool.ltxml b/lib/LaTeXML/Engine/LaTeX.pool.ltxml index 652d832d1..9dfa64d81 100644 --- a/lib/LaTeXML/Engine/LaTeX.pool.ltxml +++ b/lib/LaTeXML/Engine/LaTeX.pool.ltxml @@ -4889,6 +4889,7 @@ DefConstructor('\@framebox[Dimension][]{}', $document->setAttribute($c[0], $k => $v); } } } } ); +AssignValue(allocated_boxes => 0, 'global'); DefPrimitive('\newsavebox DefToken', sub { my $n = LookupValue('allocated_boxes') + 1; AssignValue(allocated_boxes => $n, 'global'); diff --git a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml index c73691022..8eaeafa93 100644 --- a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml @@ -185,7 +185,7 @@ DefRegister('\catcode Number', Number(0), # Not used anywhere (yet) DefRegister('\sfcode Number', Number(0), getter => sub { my $code = $STATE->lookupSFcode(chr($_[0]->valueOf)); - Number(defined $code ? $code : 0); }, + Number(defined $code ? $code : 1000); }, setter => sub { $STATE->assignSFcode(chr($_[2]->valueOf) => $_[0]->valueOf, $_[1]); }); DefRegister('\lccode Number', Number(0), getter => sub { my $code = $STATE->lookupLCcode(chr($_[0]->valueOf)); diff --git a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml index 75aad44ad..9f3d82b21 100644 --- a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml @@ -285,5 +285,38 @@ DeclareFontMap('OMX', # [missing tips for horizontal curly braces] "\x{2191}", "\x{2193}", undef, undef, undef, undef, "\x{21D1}", "\x{21D3}"]); +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +# TeX's ligatures handled by rewrite regexps. +# Note: applied in reverse order of definition (latest defined applied first!) +# Note also, these area only applied in text content, not in attributes! +sub nonTypewriter { + my ($font) = @_; + return ($font->getFamily ne 'typewriter'); } + +sub nonTypewriterT1 { + my ($font) = @_; + return ($font->getFamily ne 'typewriter') && (($font->getEncoding || 'OT1') =~ /^(OT1|T1)$/); } + +# EN DASH (NOTE: With digits before & aft => \N{FIGURE DASH}) +DefLigature(qr{--}, "\x{2013}", fontTest => \&nonTypewriter); # EN dash +DefLigature(qr{---}, "\x{2014}", fontTest => \&nonTypewriter); # EM dash + +# Ligatures for doubled single left & right quotes to convert to double quotes +# [should ligatures be part of a font, in the first place? (it is in TeX!) +DefLigature(qr{\x{2018}\x{2018}}, "\x{201C}", fontTest => \&nonTypewriterT1); # double left quote +DefLigature(qr{\x{2019}\x{2019}}, "\x{201D}", fontTest => \&nonTypewriterT1); # double right quote +DefLigature(qr{\?\x{2018}}, UTF(0xBF), fontTest => \&nonTypewriterT1); # ? backquote +DefLigature(qr{!\x{2018}}, UTF(0xA1), fontTest => \&nonTypewriterT1); # ! backquote +# These ligatures are also handled by TeX. +# However, it appears that decent modern fonts in modern browsers handle these at that level. +# So it's likely not worth doing it at the conversion level, possibly adversely affecting search. +# DefLigature(qr{ff}, "\x{FB00}", fontTest => \&nonTypewriterT1); +# DefLigature(qr{fi}, "\x{FB01}", fontTest => \&nonTypewriterT1); +# DefLigature(qr{fl}, "\x{FB02}", fontTest => \&nonTypewriterT1); +# DefLigature(qr{ffi}, "\x{FB03}", fontTest => \&nonTypewriterT1); +# DefLigature(qr{ffl}, "\x{FB04}", fontTest => \&nonTypewriterT1); + +DefLigature(qr{\.\.\.}, "\x{2026}", fontTest => \&nonTypewriter); # ldots + #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1; diff --git a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml index 85cde23b2..b6c0f6437 100644 --- a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml @@ -1193,5 +1193,14 @@ DefConstructor('\lx@eqno{}', "^ #1", reversion => ''); +#====================================================================== +# Pretest for XMath to keep from interpreting math that the DOM may not allow!! +##DefMathRewrite(xpath=>'descendant-or-self::ltx:XMath',match=>'\cdot\cdot\cdot',replace=>'\cdots'); + +DefMathLigature("\x{22C5}\x{22C5}\x{22C5}" => "\x{22EF}", role => 'ID', name => 'cdots'); + +#DefMathRewrite(xpath=>'descendant-or-self::ltx:XMath',match=>'...',replace=>'\ldots'); +DefMathLigature("..." => "\x{2026}", role => 'ID', name => 'ldots'); + #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 1; diff --git a/lib/LaTeXML/Engine/TeX_Paragraph.pool.ltxml b/lib/LaTeXML/Engine/TeX_Paragraph.pool.ltxml index 6807ba5a9..884e79c22 100644 --- a/lib/LaTeXML/Engine/TeX_Paragraph.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Paragraph.pool.ltxml @@ -68,6 +68,19 @@ DefConstructorI('\noindent', undef, sub { # Otherwise ignore. return; }); +sub alignLine { + my ($document, $line, $alignment) = @_; + if ($document->isOpenable('ltx:p')) { + $document->insertElement('ltx:p', $line, class => 'ltx_align_' . $alignment); } + elsif ($document->isOpenable('ltx:text')) { + $document->insertElement('ltx:text', $line, class => 'ltx_align_' . $alignment); + $document->insertElement('ltx:break'); } + else { + Info('unexpected', 'alignment', $document, + "Lost requested alignment '$alignment'; no suitable element"); + $document->absorb($line); } + return; } + # represents a Logical Paragraph, whereas is a `physical paragraph'. # A para can contain both p and displayed equations and such. diff --git a/lib/LaTeXML/Engine/plain.pool.ltxml b/lib/LaTeXML/Engine/plain.pool.ltxml index 036ed3c92..4047a73b1 100644 --- a/lib/LaTeXML/Engine/plain.pool.ltxml +++ b/lib/LaTeXML/Engine/plain.pool.ltxml @@ -95,39 +95,9 @@ DefMathRewrite(xpath => 'descendant-or-self::ltx:XMWrap[' }); #====================================================================== -# TeX's ligatures handled by rewrite regexps. -# Note: applied in reverse order of definition (latest defined applied first!) -# Note also, these area only applied in text content, not in attributes! DefPrimitive('\@@endash', sub { Box("\x{2013}", undef, undef, T_CS('\@@endash')); }); DefPrimitive('\@@emdash', sub { Box("\x{2014}", undef, undef, T_CS('\@@emdash')); }); -sub nonTypewriter { - my ($font) = @_; - return ($font->getFamily ne 'typewriter'); } - -sub nonTypewriterT1 { - my ($font) = @_; - return ($font->getFamily ne 'typewriter') && (($font->getEncoding || 'OT1') =~ /^(OT1|T1)$/); } - -# EN DASH (NOTE: With digits before & aft => \N{FIGURE DASH}) -DefLigature(qr{--}, "\x{2013}", fontTest => \&nonTypewriter); # EN dash -DefLigature(qr{---}, "\x{2014}", fontTest => \&nonTypewriter); # EM dash - -# Ligatures for doubled single left & right quotes to convert to double quotes -# [should ligatures be part of a font, in the first place? (it is in TeX!) -DefLigature(qr{\x{2018}\x{2018}}, "\x{201C}", fontTest => \&nonTypewriterT1); # double left quote -DefLigature(qr{\x{2019}\x{2019}}, "\x{201D}", fontTest => \&nonTypewriterT1); # double right quote -DefLigature(qr{\?\x{2018}}, UTF(0xBF), fontTest => \&nonTypewriterT1); # ? backquote -DefLigature(qr{!\x{2018}}, UTF(0xA1), fontTest => \&nonTypewriterT1); # ! backquote -# These ligatures are also handled by TeX. -# However, it appears that decent modern fonts in modern browsers handle these at that level. -# So it's likely not worth doing it at the conversion level, possibly adversely affecting search. -# DefLigature(qr{ff}, "\x{FB00}", fontTest => \&nonTypewriterT1); -# DefLigature(qr{fi}, "\x{FB01}", fontTest => \&nonTypewriterT1); -# DefLigature(qr{fl}, "\x{FB02}", fontTest => \&nonTypewriterT1); -# DefLigature(qr{ffi}, "\x{FB03}", fontTest => \&nonTypewriterT1); -# DefLigature(qr{ffl}, "\x{FB04}", fontTest => \&nonTypewriterT1); - DefConstructor('\TeX', "