From ed5f5189b052eec03de111970b34dce9ca1732c0 Mon Sep 17 00:00:00 2001 From: Slatian Date: Sat, 19 Nov 2022 15:45:52 +0100 Subject: [PATCH] Added a little check for the response code to not index pages that return errors or finish with codes in the 100 range --- crawler/crawler.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/crawler/crawler.go b/crawler/crawler.go index b9afec0..7cb960e 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -294,6 +294,11 @@ func Crawl(config types.Config) { // on every a element which has an href attribute, call callback c.OnHTML("a[href]", func(e *colly.HTMLElement) { + + if e.Response.StatusCode >= 400 || e.Response.StatusCode <= 100 { + return + } + link := getLink(e.Attr("href")) if findSuffix(SUFFIXES, link) { return