diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go
index 6bba96a..d3b8dd1 100644
--- a/htmlindex/attributes.go
+++ b/htmlindex/attributes.go
@@ -32,17 +32,21 @@ const (
 	BackgroundAttribute = "background"
 	HrefAttribute       = "href"
 
-	DataSrcAttribute = "data-src"
-	SrcAttribute     = "src"
+	DataSrcAttribute   = "data-src"
+	SrcAttribute       = "src"
+	DataImageAttribute = "data-image"
 
 	DataSrcSetAttribute = "data-srcset"
 	SrcSetAttribute     = "srcset"
+
+	DataUrlAttribute = "data-url"
 )
 
 // nolint: revive
 const (
 	ATag      = "a"
 	BodyTag   = "body"
+	DivTag    = "div"
 	ImgTag    = "img"
 	LinkTag   = "link"
 	ScriptTag = "script"
@@ -57,8 +61,11 @@ var Nodes = map[string]Node{
 	BodyTag: {
 		Attributes: []string{BackgroundAttribute},
 	},
+	DivTag: {
+		Attributes: []string{DataUrlAttribute},
+	},
 	ImgTag: {
-		Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
+		Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute, DataImageAttribute},
 		parser:     srcSetValueSplitter,
 	},
 	LinkTag: {
diff --git a/htmlindex/htmlindex.go b/htmlindex/htmlindex.go
index 9749d3b..911d539 100644
--- a/htmlindex/htmlindex.go
+++ b/htmlindex/htmlindex.go
@@ -165,8 +165,10 @@ func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) {
 
 	for i, value := range values {
 		value = strings.TrimSpace(value)
+		// strip the query string: extra query param arguments sometimes remain attached to the url
+		value, _, _ = strings.Cut(value, "?")
 		// remove the width in pixels after the url
 		values[i], _, _ = strings.Cut(value, " ")
 	}
 
 	return values, true
diff --git a/scraper/download.go b/scraper/download.go
index 0730970..998024d 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -21,6 +21,7 @@ var tagsWithReferences = []string{
 	htmlindex.ScriptTag,
 	htmlindex.BodyTag,
 	htmlindex.StyleTag,
+	htmlindex.DivTag,
 }
 
 func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {