Skip to content

Commit

Permalink
Convert to and from Markdown to sanitise
Browse files Browse the repository at this point in the history
  • Loading branch information
TheMightyGit committed Nov 9, 2024
1 parent 5dede72 commit 2fe2541
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 171 deletions.
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@ go 1.23
toolchain go1.23.2

require (
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0
github.com/NYTimes/gziphandler v1.1.1
github.com/andybalholm/cascadia v1.3.2
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81
github.com/k3a/html2text v1.2.1
github.com/mmcdole/gofeed v1.3.0
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f
golang.org/x/net v0.31.0
)

require (
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 // indirect
github.com/PuerkitoBio/goquery v1.10.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mmcdole/goxpp v1.1.1 // indirect
Expand Down
20 changes: 12 additions & 8 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364 h1:TDlO/A2QqlNhdvH+hDnu8cv1rouhfHgLwhGzJeHGgFQ=
github.com/JohannesKaufmann/dom v0.1.1-0.20240706125338-ff9f3b772364/go.mod h1:U+fBZLZTYiZCOwQUT04V3J4I+0TxyLNnj0R8nBlO4fk=
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0 h1:k6vBBqTmQOqLnaYkELgCU/F9xVPt3xhO1754hvlP/HM=
github.com/JohannesKaufmann/html-to-markdown/v2 v2.1.0/go.mod h1:djCj8ehU80KpSAepQciLcNzrp8hwZ1vQFnYKRo4/Cio=
github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I=
github.com/NYTimes/gziphandler v1.1.1/go.mod h1:n/CVRwUEOgIxrgPvAQhUUr9oeUtvrhMomdKFjzJNB0c=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/PuerkitoBio/goquery v1.10.0 h1:6fiXdLuUvYs2OJSvNRqlNPoBm6YABE226xrbavY5Wv4=
github.com/PuerkitoBio/goquery v1.10.0/go.mod h1:TjZZl68Q3eGHNBA8CWaxAN7rOU1EbDz3CWuolcO5Yu4=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81 h1:5lyLWsV+qCkoYqsKUDuycESh9DEIPVKN6iCFeL7ag50=
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1 h1:EGx4pi6eqNxGaHF6qqu48+N2wcFQ5qg5FXgOdqsJ5d8=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
Expand All @@ -29,6 +33,10 @@ github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9G
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sebdah/goldie/v2 v2.5.5 h1:rx1mwF95RxZ3/83sdS4Yp7t2C5TCokvWP4TBRbAyEWY=
github.com/sebdah/goldie/v2 v2.5.5/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.3.1 h1:xkr+Oxo4BOQKmkn/B9eMK0g5Kg/983T9DqqPHwYqD+8=
github.com/sergi/go-diff v1.3.1/go.mod h1:aMJSSKb2lpPvRNec0+w3fl7LP9IOFzdc9Pa4NFbPK1I=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
Expand All @@ -38,10 +46,10 @@ github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UV
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
github.com/yuin/goldmark v1.7.8 h1:iERMLn0/QJeHFhxSt3p6PeN9mGnvIKSpG9YYorDMnic=
github.com/yuin/goldmark v1.7.8/go.mod h1:uzxRWxtg69N339t3louHJ7+O03ezfj6PlliRlaOzY1E=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948 h1:kx6Ds3MlpiUHKj7syVnbp57++8WpuKPcR5yjLBjvLEA=
golang.org/x/exp v0.0.0-20240823005443-9b4947da3948/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
Expand All @@ -52,8 +60,6 @@ golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.28.0 h1:a9JDOJc5GMUJ0+UDqmLT86WiEy7iWyIhz8gz8E4e5hE=
golang.org/x/net v0.28.0/go.mod h1:yqtgsTWOOnlGLG9GFRrK3++bGOUEkNBoHZc8MEDWPNg=
golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand All @@ -75,8 +81,6 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/text v0.17.0 h1:XtiM5bkSOt+ewxlOE/aE/AKEHibwj/6gvWMl9Rsh0Qc=
golang.org/x/text v0.17.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
Expand Down
171 changes: 46 additions & 125 deletions internal/rssole/item.go
Original file line number Diff line number Diff line change
@@ -1,30 +1,31 @@
package rssole

import (
"bytes"
"crypto/md5"
"encoding/hex"
"log/slog"
"net/url"
"regexp"
"strings"
"sync"

htmltomarkdown "github.com/JohannesKaufmann/html-to-markdown/v2"
"github.com/k3a/html2text"
"github.com/mmcdole/gofeed"
"golang.org/x/exp/slog"
"golang.org/x/net/html"

"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/html"
"github.com/gomarkdown/markdown/parser"
)

type wrappedItem struct {
IsUnread bool
Feed *feed
*gofeed.Item

summary *string
description *string
descriptionImagesForDedupe *[]string
images *[]string
onceDescription sync.Once
summary *string
description *string
images *[]string
onceDescription sync.Once
}

func (w *wrappedItem) MarkReadID() string {
Expand All @@ -46,27 +47,13 @@ func (w *wrappedItem) Images() []string {

images := []string{}

// NOTE: we exclude images that already appear in the description (gibiz)

// standard supplied image
if w.Item.Image != nil {
if !w.isDescriptionImage(w.Item.Image.URL) {
// fmt.Println(w.Item.Image.URL)
images = append(images, w.Item.Image.URL)
}
}

// mastodon/gibiz images
if media, found := w.Item.Extensions["media"]; found {
if content, found := media["content"]; found {
for _, v := range content {
if v.Attrs["medium"] == "image" {
imageURL := v.Attrs["url"]
if !w.isDescriptionImage(imageURL) {
// fmt.Println(w.Description())
// fmt.Printf("%v = %+v\n", k, imageUrl)
images = append(images, imageURL)
}
images = append(images, imageURL)
}
}
}
Expand All @@ -91,37 +78,28 @@ func (w *wrappedItem) Images() []string {
}
}

w.images = &images

return *w.images
}

func (w *wrappedItem) isDescriptionImage(src string) bool {
// strip anything after ? to get rid of query string part
srcNoQueryString := strings.Split(src, "?")[0]
// Now drop any metadata image whose URL already appears in the description.
// The query string part of the URL is ignored when comparing.

if w.descriptionImagesForDedupe == nil {
// force lazy load if it hasn't already
_ = w.Description()
}
dedupedImages := []string{}

for _, v := range *w.descriptionImagesForDedupe {
// fmt.Println(v, "==", src)
if v == srcNoQueryString {
return true
for _, img := range images {
srcNoQueryString := strings.Split(img, "?")[0]
if !strings.Contains(w.Description(), srcNoQueryString) {
dedupedImages = append(dedupedImages, img)
} else {
slog.Info("dedeuped meta image as already found in content", "src", img)
}
}

return false
}
w.images = &dedupedImages

var (
tagsToRemoveRe = regexp.MustCompile("script|style|link|meta|iframe|form")
attrsToRemoveRe = regexp.MustCompile("style|class|hx-.*|data-.*|srcset|width|height|sizes|loading|decoding|target")
)
return *w.images
}

func (w *wrappedItem) Description() string {
w.onceDescription.Do(func() {

Check failure on line 101 in internal/rssole/item.go

View workflow job for this annotation

GitHub Actions / build

block should not start with a whitespace (wsl)

// create a list of descriptions from various sources,
// we'll pick the longest later on.
descSources := []*string{
Expand Down Expand Up @@ -156,89 +134,30 @@ func (w *wrappedItem) Description() string {
}
}

// try and sanitise any html
doc, err := html.Parse(strings.NewReader(*desc))
// Now simplify the (potential) HTML by converting
// it to and from markdown.

// First convert the arbitrary feed HTML to Markdown.
doc, err := htmltomarkdown.ConvertString(*desc)
if err != nil {

Check failure on line 142 in internal/rssole/item.go

View workflow job for this annotation

GitHub Actions / build

ifElseChain: rewrite if-else to switch statement (gocritic)
// failed to sanitise, so just return as is...
slog.Warn("html.Parse failed, returning unsanitised content", "error", err)
slog.Warn("htmltomarkdown.ConvertString failed, returning unsanitised content", "error", err)

w.description = desc
} else {
w.descriptionImagesForDedupe = &[]string{}
toDelete := []*html.Node{}

var f func(*html.Node)
f = func(n *html.Node) {
// fmt.Println(n)
if n.Type == html.ElementNode {
// fmt.Println(n.Data)
if tagsToRemoveRe.MatchString(n.Data) {
// fmt.Println("removing", n.Data, "tag")
toDelete = append(toDelete, n)

return
}

allowedAttrs := []html.Attribute{}

for i := range n.Attr {
if !attrsToRemoveRe.MatchString(n.Attr[i].Key) {
allowedAttrs = append(allowedAttrs, n.Attr[i])
}
}

n.Attr = allowedAttrs

if n.Data == "a" {
// fmt.Println("making", n.Data, "tag target new tab")
n.Attr = append(n.Attr, html.Attribute{
Namespace: "",
Key: "target",
Val: "_new",
})
// disable href if it starts with #
for i := range n.Attr {
if n.Attr[i].Key == "href" && n.Attr[i].Val[0] == '#' {
n.Attr[i].Key = "xxxhref" // easier than removing the attr

break
}
}
}

if n.Data == "img" || n.Data == "svg" {
// fmt.Println("making", n.Data, "tag style max-width 60%")
n.Attr = append(n.Attr, html.Attribute{
Namespace: "",
Key: "style",
Val: "max-width: 60%;",
})
// keep a note of images so we can de-dupe attached
// images that also appear in the content.
for _, a := range n.Attr {
if a.Key == "src" {
// strip anything after ? to get rid of query string part
bits := strings.Split(a.Val, "?")
*w.descriptionImagesForDedupe = append(*w.descriptionImagesForDedupe, bits[0])
}
}
}
}
} else if doc == "" {
slog.Warn("htmltomarkdown.ConvertString result blank, using original.")

for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)

for _, n := range toDelete {
n.Parent.RemoveChild(n)
}

renderBuf := bytes.NewBufferString("")
_ = html.Render(renderBuf, doc)
desc := renderBuf.String()
w.description = &desc
w.description = desc
} else {
// parse markdown
p := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs | parser.NoEmptyLineBeforeBlock)
md := p.Parse([]byte(doc))

// render to HTML (embedded images are kept in the output; Images() drops any metadata image whose URL already appears in it)
renderer := html.NewRenderer(html.RendererOptions{
Flags: html.CommonFlags | html.HrefTargetBlank,
})
mdHTML := string(markdown.Render(md, renderer))
w.description = &mdHTML
}
})

Expand All @@ -257,6 +176,8 @@ func (w *wrappedItem) Summary() string {
plainDesc = plainDesc[:maxDescriptionLength]
}

plainDesc = strings.TrimSpace(plainDesc)

// if summary is identical to title return nothing
if plainDesc == w.Title {
plainDesc = ""
Expand Down
43 changes: 6 additions & 37 deletions internal/rssole/item_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,51 +61,20 @@ func TestDescription_HtmlSanitised(t *testing.T) {
<meta foo="Should Be Deleted">
<iframe>Should Be Deleted</iframe>
<a></a>
<img >
<svg />
<img foo="bar" width="10000" src="http://example.com/example.gif" alt="my alt" />
<svg width="200" height="250" version="1.1" xmlns="http://www.w3.org/2000/svg">
</svg>
<form></form>
`,
},
}
expectedHTML := `<html><head>
</head><body>
<a target="_new"></a>
<img style="max-width: 60%;"/>
<svg style="max-width: 60%;"></svg>
</body></html>`
expectedHTML := `<p><img src="http://example.com/example.gif" alt="my alt" /></p>
`

d := w.Description()

if d != expectedHTML {
t.Fatal("description not as expected. got", d, "expected:", expectedHTML)
}
}

func TestIsDescriptionImage(t *testing.T) {
w := wrappedItem{
Item: &gofeed.Item{
Description: `
<img src='this_image_is_present' />
<svg src='this_svg_is_present' />
<button src='this_not_an_image' />
`,
},
}

if !w.isDescriptionImage("this_image_is_present") {
t.Error("expected to find 'this_image_is_present' in description images")
}

if !w.isDescriptionImage("this_svg_is_present") {
t.Error("expected to find 'this_svg_is_present' in description images")
}

if w.isDescriptionImage("this_not_an_image") {
t.Error("expected not to find 'this_not_an_image' in description images")
t.Fatal("description not as expected. got:", d, "expected:", expectedHTML)
}
}

Expand Down
5 changes: 5 additions & 0 deletions internal/rssole/templates/base.go.html
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
#feeds .badge {
width: 3.5em;
}

.embeddedcontent img {
max-width: 60%;
}

</style>
</head>
<body>
Expand Down
Loading

0 comments on commit 2fe2541

Please sign in to comment.