Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Limit TitleExtractor to allow only Remark42 whitelisted domains #1681

Merged
merged 2 commits into from
Oct 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 40 additions & 1 deletion backend/app/cmd/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"embed"
"fmt"
"net"
"net/http"
"net/url"
"os"
Expand Down Expand Up @@ -504,7 +505,7 @@ func (s *ServerCommand) newServerApp(ctx context.Context) (*serverApp, error) {
MaxVotes: s.MaxVotes,
PositiveScore: s.PositiveScore,
ImageService: imageService,
TitleExtractor: service.NewTitleExtractor(http.Client{Timeout: time.Second * 5}),
TitleExtractor: service.NewTitleExtractor(http.Client{Timeout: time.Second * 5}, s.getAllowedDomains()),
RestrictedWordsMatcher: service.NewRestrictedWordsMatcher(service.StaticRestrictedWordsLister{Words: s.RestrictedWords}),
}
dataService.RestrictSameIPVotes.Enabled = s.RestrictVoteIP
Expand Down Expand Up @@ -633,6 +634,44 @@ func (s *ServerCommand) newServerApp(ctx context.Context) (*serverApp, error) {
}, nil
}

// Extract second level domains from s.RemarkURL and s.AllowedHosts.
// It can be and IP like http//127.0.0.1 in which case we need to use whole IP as domain
func (s *ServerCommand) getAllowedDomains() []string {
rawDomains := s.AllowedHosts
rawDomains = append(rawDomains, s.RemarkURL)
allowedDomains := []string{}
for _, rawURL := range rawDomains {
// case of 'self' AllowedHosts, which is not a valid rawURL name
if rawURL == "self" || rawURL == "'self'" || rawURL == "\"self\"" {
continue
}
// AllowedHosts usually don't have https:// prefix, so we're adding it just to make parsing below work the same way as for RemarkURL
if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
rawURL = "https://" + rawURL
paskal marked this conversation as resolved.
Show resolved Hide resolved
}
parsedURL, err := url.Parse(rawURL)
if err != nil {
log.Printf("[WARN] failed to parse URL %s for TitleExtract whitelist: %v", rawURL, err)
continue
}
domain := parsedURL.Hostname()

if domain == "" || // don't add empty domain as it will allow everything to be extracted
(len(strings.Split(domain, ".")) < 2 && // don't allow single-word domains like "com"
domain != "localhost") { // localhost is an exceptional single-word domain which is allowed
continue
}

// if domain is not IP and has more than two levels, extract second level domain
if net.ParseIP(domain) == nil && len(strings.Split(domain, ".")) > 2 {
domain = strings.Join(strings.Split(domain, ".")[len(strings.Split(domain, "."))-2:], ".")
}

allowedDomains = append(allowedDomains, domain)
}
return allowedDomains
}

// Run all application objects
func (a *serverApp) run(ctx context.Context) error {
if a.AdminPasswd != "" {
Expand Down
22 changes: 22 additions & 0 deletions backend/app/cmd/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -726,6 +726,28 @@ func Test_splitAtCommas(t *testing.T) {
}
}

func Test_getAllowedDomains(t *testing.T) {
tbl := []struct {
s ServerCommand
allowedDomains []string
}{
// correct example, parsed and returned as allowed domain
{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "https://remark42.example.org"}}, []string{"example.org"}},
paskal marked this conversation as resolved.
Show resolved Hide resolved
{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "http://remark42.example.org"}}, []string{"example.org"}},
{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "http://localhost"}}, []string{"localhost"}},
// incorrect URLs, so Hostname is empty but returned list doesn't include empty string as it would allow any domain
{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "bad hostname"}}, []string{}},
{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "not_a_hostname"}}, []string{}},
// test removal of 'self', multiple AllowedHosts. No deduplication is expected
{ServerCommand{AllowedHosts: []string{"'self'", "example.org", "test.example.org", "remark42.com"}, CommonOpts: CommonOpts{RemarkURL: "https://example.org"}}, []string{"example.org", "example.org", "remark42.com", "example.org"}},
}
for i, tt := range tbl {
t.Run(strconv.Itoa(i), func(t *testing.T) {
assert.Equal(t, tt.allowedDomains, tt.s.getAllowedDomains())
})
}
}

func chooseRandomUnusedPort() (port int) {
for i := 0; i < 10; i++ {
port = 40000 + int(rand.Int31n(10000))
Expand Down
2 changes: 1 addition & 1 deletion backend/app/rest/api/admin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ func TestAdmin_Title(t *testing.T) {
ts, srv, teardown := startupT(t)
defer teardown()

srv.DataService.TitleExtractor = service.NewTitleExtractor(http.Client{Timeout: time.Second})
srv.DataService.TitleExtractor = service.NewTitleExtractor(http.Client{Timeout: time.Second}, []string{"127.0.0.1"})
tss := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
if r.URL.String() == "/post1" {
_, err := w.Write([]byte("<html><title>post1 blah 123</title><body> 2222</body></html>"))
Expand Down
5 changes: 3 additions & 2 deletions backend/app/rest/api/rest_public_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -480,7 +480,7 @@ func TestRest_FindUserComments(t *testing.T) {

func TestRest_FindUserComments_CWE_918(t *testing.T) {
ts, srv, teardown := startupT(t)
srv.DataService.TitleExtractor = service.NewTitleExtractor(http.Client{Timeout: time.Second}) // required for extracting the title, bad URL test
srv.DataService.TitleExtractor = service.NewTitleExtractor(http.Client{Timeout: time.Second}, []string{"radio-t.com"}) // required for extracting the title, bad URL test
defer srv.DataService.TitleExtractor.Close()
defer teardown()

Expand All @@ -496,7 +496,8 @@ func TestRest_FindUserComments_CWE_918(t *testing.T) {

assert.False(t, backendRequestedArbitraryServer)
addComment(t, arbitraryURLComment, ts)
assert.True(t, backendRequestedArbitraryServer)
assert.False(t, backendRequestedArbitraryServer,
"no request is expected to the test server as it's not in the list of the allowed domains for the title extractor")

res, code := get(t, ts.URL+"/api/v1/comments?site=remark42&user=provider1_dev")
assert.Equal(t, http.StatusOK, code)
Expand Down
10 changes: 5 additions & 5 deletions backend/app/store/service/service_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ func TestService_CreateFromPartialWithTitle(t *testing.T) {
eng, teardown := prepStoreEngine(t)
defer teardown()
b := DataStore{Engine: eng, AdminStore: ks,
TitleExtractor: NewTitleExtractor(http.Client{Timeout: 5 * time.Second})}
TitleExtractor: NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{"127.0.0.1"})}
defer b.Close()

postPath := "/post/42"
Expand Down Expand Up @@ -195,7 +195,7 @@ func TestService_SetTitle(t *testing.T) {
eng, teardown := prepStoreEngine(t)
defer teardown()
b := DataStore{Engine: eng, AdminStore: ks,
TitleExtractor: NewTitleExtractor(http.Client{Timeout: 5 * time.Second})}
TitleExtractor: NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{"127.0.0.1"})}
defer b.Close()
comment := store.Comment{
Text: "text",
Expand Down Expand Up @@ -1658,10 +1658,10 @@ func TestService_DoubleClose_Static(t *testing.T) {
eng, teardown := prepStoreEngine(t)
defer teardown()
b := DataStore{Engine: eng, AdminStore: ks,
TitleExtractor: NewTitleExtractor(http.Client{Timeout: 5 * time.Second})}
b.Close()
TitleExtractor: NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{})}
assert.NoError(t, b.Close())
// second call should not result in panic or errors
b.Close()
assert.NoError(t, b.Close())
}

// makes new boltdb, put two records
Expand Down
45 changes: 33 additions & 12 deletions backend/app/store/service/title.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
"time"

Expand All @@ -19,14 +20,17 @@ const (

// TitleExtractor gets html title from remote page, cached
type TitleExtractor struct {
	client         http.Client      // client used for fetching pages; Timeout and Transport are reused per request
	cache          lcw.LoadingCache // caches extracted titles (and failures) to avoid repeated fetches
	allowedDomains []string         // only URLs whose host matches one of these domains are fetched
}

// NewTitleExtractor makes extractor with cache. If memory cache failed, switching to no-cache
func NewTitleExtractor(client http.Client) *TitleExtractor {
func NewTitleExtractor(client http.Client, allowedDomains []string) *TitleExtractor {
log.Printf("[DEBUG] creating extractor, allowed domains %+v", allowedDomains)
res := TitleExtractor{
client: client,
client: client,
allowedDomains: allowedDomains,
}
paskal marked this conversation as resolved.
Show resolved Hide resolved
var err error
res.cache, err = lcw.NewExpirableCache(lcw.TTL(teCacheTTL), lcw.MaxKeySize(teCacheMaxRecs))
Expand All @@ -38,33 +42,50 @@ func NewTitleExtractor(client http.Client) *TitleExtractor {
}

// Get page for url and return title
func (t *TitleExtractor) Get(url string) (string, error) {
func (t *TitleExtractor) Get(pageURL string) (string, error) {
// parse domain of the URL and check if it's in the allowed list
u, err := url.Parse(pageURL)
if err != nil {
return "", fmt.Errorf("failed to parse url %s: %w", pageURL, err)
}
allowed := false
for _, domain := range t.allowedDomains {
if u.Hostname() == domain ||
(strings.HasSuffix(u.Hostname(), domain) && // suffix match, e.g. "example.com" matches "www.example.com"
u.Hostname()[len(u.Hostname())-len(domain)-1] == '.') { // but we should not match "notexample.com"
allowed = true
break
}
}
if !allowed {
return "", fmt.Errorf("domain %s is not allowed", u.Host)
}
client := http.Client{Timeout: t.client.Timeout, Transport: t.client.Transport}
defer client.CloseIdleConnections()
b, err := t.cache.Get(url, func() (interface{}, error) {
resp, err := client.Get(url)
if err != nil {
return nil, fmt.Errorf("failed to load page %s: %w", url, err)
b, err := t.cache.Get(pageURL, func() (interface{}, error) {
resp, e := client.Get(pageURL)
if e != nil {
return nil, fmt.Errorf("failed to load page %s: %w", pageURL, e)
}
defer func() {
if err = resp.Body.Close(); err != nil {
log.Printf("[WARN] failed to close title extractor body, %v", err)
}
}()
if resp.StatusCode != 200 {
return nil, fmt.Errorf("can't load page %s, code %d", url, resp.StatusCode)
return nil, fmt.Errorf("can't load page %s, code %d", pageURL, resp.StatusCode)
}

title, ok := t.getTitle(resp.Body)
if !ok {
return nil, fmt.Errorf("can't get title for %s", url)
return nil, fmt.Errorf("can't get title for %s", pageURL)
}
return title, nil
})

// on error save result (empty string) to cache too and return "" title
if err != nil {
_, _ = t.cache.Get(url, func() (interface{}, error) { return "", nil })
_, _ = t.cache.Get(pageURL, func() (interface{}, error) { return "", nil })
return "", err
}

Expand Down
16 changes: 8 additions & 8 deletions backend/app/store/service/title_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func TestTitle_GetTitle(t *testing.T) {
{`<html><body> 2222</body></html>`, false, ""},
}

ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second})
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{})
defer ex.Close()
for i, tt := range tbl {
tt := tt
Expand All @@ -41,7 +41,7 @@ func TestTitle_GetTitle(t *testing.T) {
}

func TestTitle_Get(t *testing.T) {
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second})
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{"127.0.0.1"})
defer ex.Close()
var hits int32
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -75,7 +75,7 @@ func TestTitle_GetConcurrent(t *testing.T) {
for n := 0; n < 1000; n++ {
body += "something something blah blah\n"
}
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second})
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{"127.0.0.1"})
defer ex.Close()
var hits int32
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down Expand Up @@ -104,7 +104,7 @@ func TestTitle_GetConcurrent(t *testing.T) {
}

func TestTitle_GetFailed(t *testing.T) {
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second})
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{"127.0.0.1"})
defer ex.Close()
var hits int32
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand All @@ -124,9 +124,9 @@ func TestTitle_GetFailed(t *testing.T) {
assert.Equal(t, int32(1), atomic.LoadInt32(&hits), "hit once, errors cached")
}

func TestTitle_DoubleClosed(*testing.T) {
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second})
ex.Close()
func TestTitle_DoubleClosed(t *testing.T) {
ex := NewTitleExtractor(http.Client{Timeout: 5 * time.Second}, []string{})
assert.NoError(t, ex.Close())
// second call should not result in panic
ex.Close()
assert.NoError(t, ex.Close())
}
Loading