Find feeds via sitemap #2996

Draft. Wants to merge 3 commits into base: main
90 changes: 82 additions & 8 deletions internal/reader/subscription/finder.go
@@ -5,6 +5,7 @@

import (
"bytes"
"encoding/xml"
"fmt"
"io"
"log/slog"
@@ -124,6 +125,14 @@
slog.Debug("Subscriptions found with well-known URLs", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
return subscriptions, nil
}
// Step 7) Check if the website has feeds in its sitemap.
slog.Debug("Try to detect feeds from sitemap", slog.String("website_url", websiteURL))
if subscriptions, localizedError := f.FindSubscriptionsFromSitemap(websiteURL); localizedError != nil {
return nil, localizedError
} else if len(subscriptions) > 0 {
slog.Debug("Subscriptions found with sitemap", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
return subscriptions, nil
}

return nil, nil
}
@@ -189,14 +198,16 @@

func (f *SubscriptionFinder) FindSubscriptionsFromWellKnownURLs(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
knownURLs := map[string]string{
"atom.xml": parser.FormatAtom,
"feed.xml": parser.FormatAtom,
"feed/": parser.FormatAtom,
"rss.xml": parser.FormatRSS,
"rss/": parser.FormatRSS,
"index.rss": parser.FormatRSS,
"index.xml": parser.FormatRSS,
"feed.atom": parser.FormatAtom,
"atom.xml": parser.FormatAtom,
"feed.xml": parser.FormatAtom,
"feed": parser.FormatAtom,
"rss.xml": parser.FormatRSS,
"rss": parser.FormatRSS,
"index.rss": parser.FormatRSS,
"index.xml": parser.FormatRSS,
"feed.atom": parser.FormatAtom,
"atom": parser.FormatAtom,
"index.atom": parser.FormatAtom,
}

websiteURLRoot := urllib.RootURL(websiteURL)
@@ -315,3 +326,66 @@

return nil, nil
}

func (f *SubscriptionFinder) FindSubscriptionsFromSitemap(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
websiteURLRoot := urllib.RootURL(websiteURL)

responseHandler := fetcher.NewResponseHandler(f.requestBuilder.ExecuteRequest(websiteURLRoot + "/sitemap.xml"))
defer responseHandler.Close()

if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return nil, localizedError
}

responseBody, localizedError := responseHandler.ReadBody(config.Opts.HTTPClientMaxBodySize())
if localizedError != nil {
slog.Warn("Unable to find subscriptions", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return nil, localizedError
}
return findSubscriptionsFromDownloadedSitemap(bytes.NewReader(responseBody))
}

func findSubscriptionsFromDownloadedSitemap(body io.Reader) (Subscriptions, *locale.LocalizedErrorWrapper) {
var subscriptions Subscriptions
loc := struct {
Content string `xml:",chardata"`
}{}

decoder := xml.NewDecoder(body)
for {
t, _ := decoder.Token()
if t == nil {
break
}
switch se := t.(type) {

// NOTE (GitHub Actions / Golang Linters, check failure on line 361):
// singleCaseSwitch: should rewrite switch statement to if statement (gocritic)
// A rewrite sketch follows after this function.
case xml.StartElement:
if se.Name.Local != "loc" {
continue
}

if err := decoder.DecodeElement(&loc, &se); err != nil {
slog.Warn("Unable to decode loc", slog.Any("error", err))
}
feedUrl := loc.Content
switch {
case strings.Contains(feedUrl, ".xml"),
strings.Contains(feedUrl, "rss"):
subscriptions = append(subscriptions, &Subscription{
Type: parser.FormatRSS,
Title: feedUrl,
URL: feedUrl,
})
case strings.Contains(feedUrl, "feed"),
strings.Contains(feedUrl, "atom"):
subscriptions = append(subscriptions, &Subscription{
Type: parser.FormatAtom,
Title: feedUrl,
URL: feedUrl,
})
}
}
}

return subscriptions, nil
}
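
For the gocritic finding noted in the loop above, here is a minimal standalone sketch of the suggested shape (not part of this diff; extractSitemapLocations is a hypothetical helper used only for illustration), with a type assertion in place of the single-case type switch:

package subscription

import (
	"encoding/xml"
	"io"
	"log/slog"
)

// extractSitemapLocations collects the text content of every <loc> element in a sitemap.
// The one-case type switch from the diff is replaced by a type assertion, which is the
// rewrite the gocritic singleCaseSwitch check suggests.
func extractSitemapLocations(body io.Reader) []string {
	var locations []string
	loc := struct {
		Content string `xml:",chardata"`
	}{}

	decoder := xml.NewDecoder(body)
	for {
		t, _ := decoder.Token()
		if t == nil {
			break
		}
		se, ok := t.(xml.StartElement)
		if !ok || se.Name.Local != "loc" {
			continue
		}
		if err := decoder.DecodeElement(&loc, &se); err != nil {
			slog.Warn("Unable to decode loc", slog.Any("error", err))
			continue
		}
		locations = append(locations, loc.Content)
	}
	return locations
}

The RSS/Atom classification of each returned URL would stay exactly as written in findSubscriptionsFromDownloadedSitemap above.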
40 changes: 40 additions & 0 deletions internal/reader/subscription/finder_test.go
@@ -481,3 +481,43 @@ func TestParseWebPageWithNoHref(t *testing.T) {
t.Fatal(`Incorrect number of subscriptions returned`)
}
}

func TestParseSiteMap(t *testing.T) {
sitemap := `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<url>
<loc>http://www.example.com/</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/feed/myfeed</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/myfeed.xml</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
<url>
<loc>http://www.example.com/atom_feed.xml</loc>
<lastmod>2005-01-01</lastmod>
<changefreq>monthly</changefreq>
<priority>0.8</priority>
</url>
</urlset> `

subscriptions, err := findSubscriptionsFromDownloadedSitemap(strings.NewReader(sitemap))
if err != nil {
t.Fatalf(`Parsing a correctly formatted sitemap should not return any error: %v`, err)
}

if len(subscriptions) != 3 {
t.Fatal(`Incorrect number of subscriptions returned`)
}
}