diff --git a/htmlindex/attributes.go b/htmlindex/attributes.go
index 6bba96a..d3b8dd1 100644
--- a/htmlindex/attributes.go
+++ b/htmlindex/attributes.go
@@ -32,17 +32,21 @@ const (
BackgroundAttribute = "background"
HrefAttribute = "href"
- DataSrcAttribute = "data-src"
- SrcAttribute = "src"
+ DataSrcAttribute = "data-src"
+ SrcAttribute = "src"
+ DataImageAttribute = "data-image"
DataSrcSetAttribute = "data-srcset"
SrcSetAttribute = "srcset"
+
+ DataUrlAttribute = "data-url"
)
// nolint: revive
const (
ATag = "a"
BodyTag = "body"
+ DivTag = "div"
ImgTag = "img"
LinkTag = "link"
ScriptTag = "script"
@@ -57,8 +61,11 @@ var Nodes = map[string]Node{
BodyTag: {
Attributes: []string{BackgroundAttribute},
},
+ DivTag: {
+ Attributes: []string{DataUrlAttribute},
+ },
ImgTag: {
- Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute},
+ Attributes: []string{SrcAttribute, DataSrcAttribute, SrcSetAttribute, DataSrcSetAttribute, DataImageAttribute},
parser: srcSetValueSplitter,
},
LinkTag: {
diff --git a/htmlindex/htmlindex.go b/htmlindex/htmlindex.go
index 9749d3b..911d539 100644
--- a/htmlindex/htmlindex.go
+++ b/htmlindex/htmlindex.go
@@ -165,8 +165,11 @@ func srcSetValueSplitter(data nodeAttributeParserData) ([]string, bool) {
for i, value := range values {
value = strings.TrimSpace(value)
+ // drop any query string still attached to the URL: keep only the part before the first "?"
+ value = strings.Split(value, "?")[0]
// remove the width in pixels after the url
- values[i], _, _ = strings.Cut(value, " ")
+ value, _, _ = strings.Cut(value, " ")
+ values[i] = value
}
return values, true
diff --git a/scraper/download.go b/scraper/download.go
index 0730970..998024d 100644
--- a/scraper/download.go
+++ b/scraper/download.go
@@ -21,6 +21,7 @@ var tagsWithReferences = []string{
htmlindex.ScriptTag,
htmlindex.BodyTag,
htmlindex.StyleTag,
+ htmlindex.DivTag,
}
func (s *Scraper) downloadReferences(ctx context.Context, index *htmlindex.Index) error {