Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Fix redirect scraping #957

Merged
merged 2 commits into from
Jan 12, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified docker/sswlinkauditor
Binary file not shown.
118 changes: 89 additions & 29 deletions docker/sswlinkauditor.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,34 @@ func addClientHeaders(r *http.Request) {
}
}

// isRedirect reports whether resp carries a status code strictly between
// 300 and 400. Note that 300 itself is not treated as a redirect here.
func isRedirect(resp *http.Response) bool {
	code := resp.StatusCode
	return code >= 301 && code <= 399
}

// maxRedirectHops caps how many redirect responses getRedirectLocation will
// follow, so that a redirect loop cannot keep it running forever.
const maxRedirectHops = 10

// getRedirectLocation walks a chain of redirects starting at url and returns
// the last URL it reached.
//
// client is expected not to follow redirects on its own (for example by
// returning http.ErrUseLastResponse from CheckRedirect); each hop is requested
// individually here. The walk stops and returns the current URL when:
//   - the request fails (the error is printed),
//   - the response is not a redirect,
//   - the redirect response has no usable Location header, or
//   - maxRedirectHops redirects have been followed.
func getRedirectLocation(url string, client *http.Client) string {
	current := url
	for hop := 0; hop < maxRedirectHops; hop++ {
		resp, err := client.Get(current)
		if err != nil {
			fmt.Println("encountered error following redirect chain: ", current, err)
			return current
		}

		redirected := isRedirect(resp)
		// Location resolves a relative Location header against the URL that
		// was just requested, so the next hop is always an absolute URL.
		next, locErr := resp.Location()
		// Only the headers are needed; close the body on every hop so the
		// underlying connection is released.
		resp.Body.Close()

		if !redirected || locErr != nil {
			return current
		}
		current = next.String()
	}

	fmt.Println("gave up following redirect chain after", maxRedirectHops, "hops: ", current)
	return current
}

// getRedirectChainFinalUrl returns the URL that getRedirectLocation reports
// for url, using a dedicated HTTP client with a one-minute timeout that does
// not follow redirects by itself.
func getRedirectChainFinalUrl(url string) string {
	// Returning http.ErrUseLastResponse makes the client hand back the
	// redirect response instead of following it.
	stopAtRedirect := func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	}

	client := new(http.Client)
	client.CheckRedirect = stopAtRedirect
	client.Timeout = time.Minute

	return getRedirectLocation(url, client)
}

func check(link Link, linkch chan LinkStatus, number int) {
fmt.Println("CHEC", number, link.url)

Expand Down Expand Up @@ -106,10 +134,19 @@ func check(link Link, linkch chan LinkStatus, number int) {
}
}

// isSameOrigin reports whether url1 and url2 parse to the same Host
// (hostname plus optional port). The scheme is not compared, so an http and
// an https URL on the same host count as the same origin here.
//
// If either URL cannot be parsed the result is false; a failed parse yields
// no URL value to read the host from.
func isSameOrigin(url1 string, url2 string) bool {
	url1Parsed, err := urlP.Parse(url1)
	if err != nil {
		return false
	}
	url2Parsed, err := urlP.Parse(url2)
	if err != nil {
		return false
	}
	return url1Parsed.Host == url2Parsed.Host
}

func crawl(link Link, ch chan Link, linkch chan LinkStatus, number int) {
fmt.Println("CRAW", number, link.url)

client := &http.Client{
// prevents the client from following redirects to end of chain
CheckRedirect: func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
},
Timeout: 1 * time.Minute,
}
resp, err := client.Get(link.url)
Expand All @@ -135,43 +172,66 @@ func crawl(link Link, ch chan Link, linkch chan LinkStatus, number int) {
b := resp.Body
defer b.Close()

z := html.NewTokenizer(b)
// check whether the response was a redirect
if isRedirect(resp) {
finalUrl := getRedirectChainFinalUrl(link.url)

if isSameOrigin(link.url, finalUrl) {
// if the url is on the same origin, add it to the channel to be scraped
ch <- Link{finalUrl, link.url, "a", link.anchor + " (redirected)"}
}

// check if the final page 404s or not
newResp, newErr := client.Get(finalUrl)

if newErr == nil {
// use the response from the end of the redirect chain to determine if the current link
// is valid
resp = newResp
err = newErr

// return prematurely to skip scraping the HTML from redirect urls
// if the destination of the redirect is on the same origin it will be scraped later
return
}
}
scrapeLinksFromHtml(ch, link.url, b)
}

func scrapeLinksFromHtml(ch chan Link, pageFound string, htmlString io.ReadCloser) {
z := html.NewTokenizer(htmlString)
depth := 0
var linkUrl string
for {
tt := z.Next()
switch tt {
case html.ErrorToken:
err := z.Err()
if err == io.EOF {
case html.ErrorToken:
err := z.Err()
if err == io.EOF {
return
}
fmt.Println("Error with tokenizer", err)
return
}
fmt.Println("Error with tokenizer", err)
return

case html.TextToken:
if depth > 0 {
text := strings.TrimSpace(string(z.Text()))
ch <- Link{linkUrl, link.url, "a", text}
}
case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
t := z.Token()

if t.Data == "a" || t.Data == "img" || t.Data == "link" || t.Data == "iframe" {
_, newUrl := getHref(t)
if t.Data == "a" {
linkUrl = newUrl
if tt == html.StartTagToken {
depth++
} else if tt == html.EndTagToken {
depth--
case html.TextToken:
if depth > 0 {
text := strings.TrimSpace(string(z.Text()))
ch <- Link{linkUrl, pageFound, "a", text}
}
case html.StartTagToken, html.SelfClosingTagToken, html.EndTagToken:
t := z.Token()
if t.Data == "a" || t.Data == "img" || t.Data == "link" || t.Data == "iframe" {
_, newUrl := getHref(t)
if t.Data == "a" {
linkUrl = newUrl
if tt == html.StartTagToken {
depth++
} else if tt == html.EndTagToken {
depth--
}
} else {
ch <- Link{newUrl, pageFound, t.Data, ""}
}
} else {
ch <- Link{newUrl, link.url, t.Data, ""}
}
}

}
}
}
Expand Down
Loading