From 98dae2fba3eb37bad0e11253d119c3493cf7a43c Mon Sep 17 00:00:00 2001 From: David Warring Date: Tue, 24 Dec 2024 06:25:05 +1300 Subject: [PATCH] Better support various em space characters --- lib/PDF/Content/Text/Box.rakumod | 34 +++++++++++++++++++---------- lib/PDF/Content/Text/Line.rakumod | 11 ++++++---- lib/PDF/Content/Text/Style.rakumod | 10 ++++----- t/pdf-text-align.pdf | Bin 12556 -> 12556 bytes t/pdf-text-hyphenation.pdf | Bin 12799 -> 12799 bytes t/pdf-text-indent.pdf | Bin 5952 -> 5952 bytes t/pdf-text-style.pdf | Bin 8292 -> 8292 bytes t/pdf-text-verbatim.pdf | Bin 6766 -> 6766 bytes t/text-box-images.pdf | Bin 3677 -> 3677 bytes t/text-box.pdf | Bin 1760 -> 2211 bytes t/text-box.t | 13 ++++++++++- 11 files changed, 47 insertions(+), 21 deletions(-) diff --git a/lib/PDF/Content/Text/Box.rakumod b/lib/PDF/Content/Text/Box.rakumod index 5cecd02..969fe3d 100644 --- a/lib/PDF/Content/Text/Box.rakumod +++ b/lib/PDF/Content/Text/Box.rakumod @@ -136,7 +136,7 @@ method content-height returns Numeric { @!linesĀ».height.sum * $.leading; } my grammar Text { token nbsp { <[ \c[NO-BREAK SPACE] \c[NARROW NO-BREAK SPACE] \c[WORD JOINER] ]> } - token space { [\s > | "\c[ZERO WIDTH SPACE]"]+ } + token space { [\s > | \c[ZERO WIDTH SPACE] ]+ } token hyphen { <[ \c[HYPHEN] \c[HYPHEN-MINUS] \c[HYPHENATION POINT] ]> } token word { [ . ]+ <[ \c[HYPHEN] \c[HYPHEN-MINUS] ]>? | <.hyphen> } } @@ -218,7 +218,8 @@ method !layup(@atoms is copy) { my Int $i = 0; my Int $line-start = 0; my Int $n = +@atoms; - my UInt $preceding-spaces = self!flush-spaces: @atoms, $i; + my $em-spaces = self!word-gap($!style.scale: 1000) / self!word-gap; + my Numeric $preceding-spaces = self!flush-spaces: $em-spaces, @atoms, $i; my $word-gap := self!word-gap; my $height := $!style.font-size; my Numeric $hyphen-width; @@ -317,7 +318,7 @@ method !layup(@atoms is copy) { if $height > $line.height; $prev-soft-hyphen = $soft-hyphen; - $preceding-spaces = self!flush-spaces(@atoms, $i); + $preceding-spaces = self!flush-spaces($em-spaces, @atoms, $i); } if $preceding-spaces { @@ -360,20 +361,31 @@ method !height-exceeded { $!height && self.content-height > $!height; } -method !flush-spaces(@words is raw, $i is rw) returns UInt { +my constant %SpaceWidth = %( + "\c[EN SPACE]" => .5, + "\c[EM SPACE]" => 1, + "\c[THREE-PER-EM SPACE]" => 3, + "\c[FOUR-PER-EM SPACE]" => 4, + "\c[SIX-PER-EM SPACE]" => 6, + "\c[THIN SPACE]" => .2, + "\c[HAIR SPACE]" => .1, + "\c[ZERO WIDTH SPACE]" => 0, +); + +method !flush-spaces($em-spaces is rw, @words is raw, $i is rw) returns Numeric:D { my $n = 0; # space count for padding purposes with @words[$i] { when // { - $n = .chars; if $!verbatim && (my $last-nl = .rindex("\n")).defined { # count spaces after last new-line - $n -= $last-nl + 1; - $n = 0 if $!squish; + $n = .substr($last-nl+1).comb.map({do with %SpaceWidth{$_} { $_ * $em-spaces } // 1}).sum + unless $!squish; } else { $i++; - $n = 1 if $!squish; - $n = 0 if .contains("\c[ZERO WIDTH SPACE]"); + $n = .comb.map({do with %SpaceWidth{$_} { $_ * $em-spaces } // 1}).sum; + $n = 1 if $n > 1 && $!squish; +dd [.uniname, :$n, :$i]; } } } @@ -381,8 +393,8 @@ method !flush-spaces(@words is raw, $i is rw) returns UInt { } # calculates actual spacing between words -method !word-gap returns Numeric { - my $word-gap = $.space-width + $.WordSpacing + $.CharSpacing; +method !word-gap($space = $.space-width) returns Numeric { + my $word-gap = $space + $.WordSpacing + $.CharSpacing; $word-gap * $.HorizScaling / 100; } diff --git a/lib/PDF/Content/Text/Line.rakumod b/lib/PDF/Content/Text/Line.rakumod index cc812e8..0165f81 100644 --- a/lib/PDF/Content/Text/Line.rakumod +++ b/lib/PDF/Content/Text/Line.rakumod @@ -49,7 +49,7 @@ has Numeric $.word-width is rw = 0; has Numeric $.word-gap is rw = 0; has Numeric $.indent is rw = 0; has Numeric $.align = 0; -has UInt @.spaces; +has Numeric @.spaces; method content-width returns Numeric { $!indent + $!word-width + @!spaces.sum * $!word-gap; @@ -153,9 +153,12 @@ method content(:$font!, Numeric :$font-size!, :$space-pad = 0, :$TextRise = 0.0) for ^+@!encoded -> $i { my $spaces := @!spaces[$i]; if $spaces { - @line.push: $font.encode(Space x $spaces); - @line.push: $space-pad * $spaces - unless $space-pad =~= 0; + my UInt $whole-spaces = $spaces.floor; + my $part-spaces = $spaces - $whole-spaces; + @line.push: $font.encode(Space x $whole-spaces); + my Int $pad = round($space-pad * $spaces + -1000 * $part-spaces * $!word-gap / $font-size); + @line.push: $pad + if $pad; } @line.append: @!encoded[$i].list; } diff --git a/lib/PDF/Content/Text/Style.rakumod b/lib/PDF/Content/Text/Style.rakumod index abe754f..8e4b0e6 100644 --- a/lib/PDF/Content/Text/Style.rakumod +++ b/lib/PDF/Content/Text/Style.rakumod @@ -70,19 +70,19 @@ multi method baseline-shift(Baseline $_ --> Numeric) { #| get/set a numeric font vertical alignment offset multi method baseline-shift is rw { $!TextRise } +method scale($v) { $v * $!font-size / $!units-per-EM; } + #| return the scaled width of spaces -method space-width { - $!space-width * $!font-size / $!units-per-EM; -} +method space-width { self.scale: $!space-width; } #| return the scaled underline position method underline-position { - ($!font.underline-position // -100) * $!font-size / $!units-per-EM; + self.scale: ($!font.underline-position // -100) } #| return the scaled underline thickness method underline-thickness { - ($!font.underline-thickness // 50) * $!font-size / $!units-per-EM; + self.scale: ($!font.underline-thickness // 50) } #| return the scaled font height diff --git a/t/pdf-text-align.pdf b/t/pdf-text-align.pdf index 9be93446e5ea412dfca60c8004d875956c6b506b..4b70f1aaf7879df1803ffb2c776393c3344909b8 100644 GIT binary patch delta 114 zcmeB4>Pgz5%O!2DpplxBS)!w$si13U0^}8^ra;-7J-8V0DwyoS^=WgMC@UvEQ^1Nc Ib%7 delta 114 zcmeB4>Pgz5%O!25pplxBS)!w$si13U0^}8^ra;-7J-8V0DwyoS^=WgMC@UvEQ^1Nc IomE;~l9^Ts=WA(c!P!2O3q-~Bic)hyB9`U~ z8kq&frEpG4eolT7Or|)q1j3yBpEF(=Xl`mwW{D0+$OOnKPECO^Q}Q9K&5c}cTtq4q RV%+>kREv{%)zJz|1OWhpJ&ynY delta 240 zcmeyL{6Bd^AE%Cif<{Sxfr6%ju8}2>omE;~l9^Ts=WA(c!P!2O3q-~Bic)hyB9>+f z8kq&frEpG4eolT7Or|)q1j3yBpEF(=Xl`mwW{D0+$OOnKPECO^Q}Q9K&5c}cTtq4q RV%+>kREv{%)zJz|1OfM1J$nEE diff --git a/t/pdf-text-indent.pdf b/t/pdf-text-indent.pdf index a74815de267d9112105d84e7e64e0827b884fa35..fb6d16080b02b192f440779671598194a4c47e15 100644 GIT binary patch delta 109 zcmX@0cR+7L8LPCBf`(6iQEIM&rh={|kds+ZTngn(p2!*}ZLXk^nv+?g1Clp10rHAd YQ=sh0^#Zw@%h}#?;#0J_Uf>`b0G_HI5dZ)H delta 109 zcmX@0cR+7L8LPCRf`(6iQEIM&rh={|kds+ZTngn(p2!*}ZKj}+nv+?g1Clp10rHAd YQ=sh0^#Zw@%h}#?;#0J_Uf>`b0Gz`f2mk;8 diff --git a/t/pdf-text-style.pdf b/t/pdf-text-style.pdf index 1eb51c9dc854c19359827a69f22cf3cefa555266..c3a18655c3d14b14375eb6343e2026552b70ce14 100644 GIT binary patch delta 461 zcmY+A!Ab)$6h${eJ1xS9bZA}F3`&^}B8o_HH;XRZi-;Q`&cym4Gnpn$5%DX`&XxXw z(614<{SYUamxa5>z3-m9n{BquzOzm;J0r=pknnnwQx4UFOo^3dmpG(p>L&BsBSY&H zf(wbJf>mpNb|(AQqy^NVa?ddj{qd*`VS<5Hr6}4|*qy}I)h&f2oIVQ4{3^2Vy77@? zb69G6iteq?unJ16nQ%BhuzTx9q2&qgEj1`-zYWsRJ;pgLbKz{W8^r0q3z|P@5;ca| ziyx?C>_yYj`Nb7U=18@b^{Tztd<5js!l1&GWVK|^tSXrFAN3Q&uJW33lt{vq@||vk!k+X3s4rdVriC*$_q~4f2fko;JOBUy delta 450 zcmY*V%Sr<=6x|5zv=pk)QW0qeq)tb1W|iB&yu{r1?rq)>WA)l-+RaaXmlwHZ|b)vv1N@9VTs_6QbCYX^nE)%a?t{p zhMy687d%j5>Em`shX+=;ZXjNsl5lBId;V=O?YrlM;AJj7*o)$me;t}XXepmq_vun`)Gi?(1F;-*^{7IY&rwXe-!lF^xo#g!)zky&^E z*Mi_3T;~GCt2n87ncCfB-uJ!lhndx9_0P*nzvFsCXt@Wj-_|4JI3-NtC>c%*NEwn6 zK!tj>Hhll69R#k2;QJJWq%1kchl>zo2!(?BsqA;PB29|*5<8~J2#LmkNXKK~>cies zSJtYh`68mpL?DtF65*Upi-mf&yj=|*GcMD_#63eZnbipN{*=e&)P&GPlo<(1oI^3{ zquZ)iUEj)yMtAu5hg8?qW?rd|9G4ZCBqM5Um{$=2-03nmRaSLcnjJF^J(ONZhbpdz zZYp+=_bEuB?|M@}B9!))S`LA5>CXk9?6n=uHyZz->fP=(^7A3Pxh*?d`~>|7M3`SX sq~$&YT7D?WPS8$sU?5~JK5%L^eSHKEER_*9nCXIF9A~BT_2zurzvWt~)&Kwi delta 690 zcmZ8fO-jQ+6h@jR;zB5>pmq_vun`)Gi%_r&anr2}3%U`R+Sg_<$>_|);>r_<$Sgd7 zYeDc1u5*FnRh-nkOzrM5@B7~O!_1nq=I3RtAG+QUI_`n%hk9fjr<6$?CBta}DI-z= zs8FxghVLJRLEw4_zRy5N%F<(exQIYTP%5aO+J09n(zIMJu@jn(kZ25uY&-_8KI}bp zWvzOeFJhWb1R{wc5zg7PT&QQu+tuJP<1$N4+%q(jTdhFvPkCZaO$bdzm64#tIh3P5 zx~+Qk^{s+vbcc_BNOfIr7nSPBNmYSKGNQ(Yc@;6hoi1}z<#nf{*$Ly&L+OQdsN#C) zre+64pMsS7t~UiFLS=8M

Cu{#@|MUfa=pqwx=_-tBIyI3KZ_+p43*PtcD+M8%~; sTJ0mC)rXSo1nsm3214fI1E8ptT!?>PzcH8Qc#Fi&%utY+{?O8MB!-XR!7%IT%=)8Y&oofI^-E7nosSXkcc9A!cM` YiXmofW-{57BbC#{+>lFE)z#k(00)c~qW}N^ diff --git a/t/text-box.t b/t/text-box.t index 25a4379..7c7d8b9 100644 --- a/t/text-box.t +++ b/t/text-box.t @@ -1,6 +1,6 @@ use v6; use Test; -plan 14; +plan 15; use lib 't'; use PDF::Grammar::Test :is-json-equiv; use PDF::Content::Text::Box; @@ -152,6 +152,17 @@ subtest 'zero width spaces', { } } +subtest 'variable spaces', { + $gfx.text: { + $text = "Spaces:en-space\c[EN SPACE]space tab\tem-space\c[EM SPACE]em-quad\c[EM QUAD]three\c[THREE-PER-EM SPACE]four\c[FOUR-PER-EM SPACE]six\c[SIX-PER-EM SPACE]thin\c[THIN SPACE]hair\c[HAIR SPACE]zero\c[ZERO WIDTH SPACE]. " x 2; + my $width = 400; + $height = 100; + $text-box .= new( :$text, :$font, :$font-size, :$width, :$height ); + .text-position = 100, 250; + .say: $text-box; + } +} + subtest 'font loading from content stream', { if (try require PDF::Font::Loader) === Nil { skip 'PDF::Font::Loader is needed for this test';