Skip to content

Commit

Permalink
Extract body_domains and body_fqdns to jsonl (#1750)
Browse files Browse the repository at this point in the history
* Extract body-domains and body-fqdns

* remove port in domains

* Add test for domains extraction

* misc update

* improve domain regex

* fix test

* extract domain inside quotes

* sanitize urls

* fix test

* minor

* do not embed

* remove js variables fp + improve regex

---------

Co-authored-by: Tarun Koyalwar <[email protected]>
  • Loading branch information
RamanaReddy0M and tarunKoyalwar authored Jun 22, 2024
1 parent 72f4c2c commit 9330887
Show file tree
Hide file tree
Showing 10 changed files with 182 additions and 5 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ cmd/functional-test/functional-test
cmd/functional-test/httpx
cmd/functional-test/*.cfg

.devcontainer
.devcontainer
/httpx
/dist
/resume.cfg
19 changes: 16 additions & 3 deletions common/httpx/csp.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package httpx

import (
"bytes"
"fmt"
"net/url"
"regexp"
"strings"

"github.com/PuerkitoBio/goquery"
Expand Down Expand Up @@ -64,7 +66,7 @@ func parsePotentialDomains(fqdns, domains map[string]struct{}, data string) {
// we extracts only potential domains
for _, t := range tokens {
if isPotentialDomain(t) {
if dn, err := publicsuffix.Parse(extractDomain(removeWildcards(t))); err == nil {
if dn, err := publicsuffix.Parse(extractDomain(t)); err == nil {
domains[dn.SLD+"."+dn.TLD] = struct{}{}
if dn.TRD != "" {
fqdns[dn.String()] = struct{}{}
Expand All @@ -79,15 +81,17 @@ func isPotentialDomain(s string) bool {
}

func extractDomain(str string) string {
str = removeWildcards(str)
u := str
if !strings.Contains(str, "://") {
u = "https://" + str
}
u = sanitizeURL(u)
parsedURL, err := url.Parse(u)
if err != nil {
return str
return ""
}
return parsedURL.Host
return parsedURL.Hostname()
}

func removeWildcards(domain string) string {
Expand All @@ -108,3 +112,12 @@ func removeWildcards(domain string) string {
}
return strings.Join(parts, ".")
}

// urlInvalidCharRegex matches any single character that is not an ASCII word
// character (letter, digit, underscore) or one of '-', '.', '/', ':', '~'.
var urlInvalidCharRegex = regexp.MustCompile(`[^\w-./:~]`)

// sanitizeURL returns u with every character matched by urlInvalidCharRegex
// replaced by its percent-encoded form.
//
// Each match is a full character, which may span several bytes in UTF-8, so
// every byte of the match is encoded (e.g. "é" becomes "%C3%A9"). Encoding only
// the first byte would silently drop the remaining bytes and yield an invalid
// escape sequence.
//
// Note that '%' is not in the allowed set, so escapes already present in u are
// encoded again ("%20" becomes "%2520").
func sanitizeURL(u string) string {
	return urlInvalidCharRegex.ReplaceAllStringFunc(u, func(match string) string {
		var b strings.Builder
		b.Grow(3 * len(match))
		// Iterate over bytes, not runes: percent-encoding is defined per byte.
		for i := 0; i < len(match); i++ {
			fmt.Fprintf(&b, "%%%02X", match[i])
		}
		return b.String()
	})
}
113 changes: 113 additions & 0 deletions common/httpx/domains.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package httpx

import (
"regexp"
"strings"
"unicode"

mapsutil "github.com/projectdiscovery/utils/maps"
stringsutil "github.com/projectdiscovery/utils/strings"
"github.com/weppos/publicsuffix-go/publicsuffix"
)

const (
// potentialDomainRegex is the pattern used to locate candidate domain names in a response body.
// It is built from three parts:
//   - a leading non-capturing group requiring start of input or one of ' " / @ before the candidate
//   - capture group 1, the candidate itself: lowercase letters, digits, dots and hyphens,
//     ending in a dot followed by at least two lowercase letters
//   - a trailing non-capturing group requiring one of ' " / @ or end of input after the candidate
// Only group 1 is the actual domain; the surrounding groups exist to filter out invalid matches
// (by skipping irrelevant contexts). The delimiters are part of the overall match, so they are consumed.
potentialDomainRegex = `(?:^|['"/@])` + `([a-z0-9]+[a-z0-9.-]*\.[a-z]{2,})` + `(?:['"/@]|$)`
)

var (
// potentialDomainsCompiled is a compiled regex for potential domains (aka domain names)
potentialDomainsCompiled = regexp.MustCompile(potentialDomainRegex)
// defaultDenylist lists dot-prefixed suffixes (mostly file extensions) that must not be treated as
// part of a domain name. init copies every entry into suffixBlacklist.
// NOTE(review): isValidDomain looks up "." + a single dot-separated label, so a multi-label entry
// such as ".tar.gz" can never match on its own; ".map" also appears twice, which is harmless once
// the entries are stored as map keys.
defaultDenylist = []string{".3g2", ".3gp", ".7z", ".apk", ".arj", ".avi", ".axd", ".bmp", ".csv", ".deb", ".dll", ".doc", ".drv", ".eot", ".exe", ".flv", ".gif", ".gifv", ".gz", ".h264", ".ico", ".iso", ".jar", ".jpeg", ".jpg", ".lock", ".m4a", ".m4v", ".map", ".mkv", ".mov", ".mp3", ".mp4", ".mpeg", ".mpg", ".msi", ".ogg", ".ogm", ".ogv", ".otf", ".pdf", ".pkg", ".png", ".ppt", ".psd", ".rar", ".rm", ".rpm", ".svg", ".swf", ".sys", ".tar.gz", ".tar", ".tif", ".tiff", ".ttf", ".txt", ".vob", ".wav", ".webm", ".webp", ".wmv", ".woff", ".woff2", ".xcf", ".xls", ".xlsx", ".zip", ".css", ".js", ".map", ".php", ".sheet", ".ms", ".wp", ".html", ".htm", ".md"}
// suffixBlacklist is the set form of defaultDenylist, filled by init and consulted by isValidDomain.
suffixBlacklist = map[string]struct{}{}
)

// BodyDomain holds the domain names that BodyDomainGrab extracted from a response body.
type BodyDomain struct {
// Fqdns are the matched host names that differ from their registrable domain
// (serialized as "body_fqdn", omitted when empty).
Fqdns []string `json:"body_fqdn,omitempty"`
// Domains are the registrable domains returned by publicsuffix.Domain for the matches
// (serialized as "body_domains", omitted when empty).
Domains []string `json:"body_domains,omitempty"`
}

// BodyDomainGrab scans r.Raw for domain-like strings and returns the unique
// registrable domains and fully qualified names found.
//
// A candidate is kept only if it passes isValidDomain and isValidTLD and
// publicsuffix.Domain can derive a registrable domain from it. Entries equal
// to r.Input are left out of both result sets. The order of the returned
// slices is not defined because they are built from map keys.
func (h *HTTPX) BodyDomainGrab(r *Response) *BodyDomain {
	var (
		rootDomains = map[string]struct{}{}
		hostnames   = map[string]struct{}{}
	)

	matches := potentialDomainsCompiled.FindAllStringSubmatch(r.Raw, -1)
	for _, groups := range matches {
		// the candidate lives in capture group 1
		if len(groups) < 2 {
			continue
		}
		candidate := groups[1]

		// cheap structural/denylist filter first, then the public suffix check
		if !isValidDomain(candidate) || !isValidTLD(candidate) {
			continue
		}

		root, err := publicsuffix.Domain(candidate)
		if err != nil {
			continue
		}

		if root != r.Input {
			rootDomains[root] = struct{}{}
		}
		// a candidate counts as an FQDN only when it carries more than the registrable domain
		if candidate != root && candidate != r.Input {
			hostnames[candidate] = struct{}{}
		}
	}

	return &BodyDomain{
		Fqdns:   mapsutil.GetKeys(hostnames),
		Domains: mapsutil.GetKeys(rootDomains),
	}
}

// isValidDomain applies cheap structural checks to a domain candidate and
// reports whether it is worth further (public suffix) validation.
//
// It returns false when:
//   - d has fewer than two dot-separated labels
//   - any label, prefixed with ".", is present in suffixBlacklist
//   - every label consists only of digits (likely an IP or a version number)
//   - d looks like a reversed package name: its first label is one of
//     com/net/io/org while its last label is none of those
func isValidDomain(d string) bool {
	parts := strings.Split(d, ".")
	if len(parts) < 2 {
		return false
	}

	// allNumeric stays true only when every label is made of digits,
	// in which case this is not a valid domain (could be an IP or something else)
	allNumeric := true
	// traverse in reverse
	for i := len(parts) - 1; i >= 0; i-- {
		if _, ok := suffixBlacklist["."+parts[i]]; ok {
			return false
		}
		// check for numeric
		for _, c := range parts[i] {
			if !unicode.IsDigit(c) {
				allNumeric = false
				break
			}
		}
	}

	if allNumeric {
		// not a domain, could be an IP or something else
		return false
	}

	// simple hack for android/ios package names (e.g. "com.example.app").
	// The comparison is anchored on label boundaries ("com." / ".com"): a bare
	// string prefix would also reject legitimate hosts such as
	// "community.example.dev", and a bare string suffix would accept
	// "com.example.studio" because "studio" ends in "io".
	if stringsutil.HasPrefixAny(d, "com.", "net.", "io.", "org.") && !stringsutil.HasSuffixAny(d, ".com", ".net", ".io", ".org") {
		return false
	}
	return true
}

// isValidTLD reports whether domain is covered by a normal-type rule of the
// default public suffix list and can be parsed against that rule.
func isValidTLD(domain string) bool {
	list := publicsuffix.DefaultList

	rule := list.Find(domain, publicsuffix.DefaultFindOptions)
	if rule == nil {
		return false
	}
	if rule.Type != publicsuffix.NormalType {
		return false
	}

	// re-parse using the rule that was just found as the default rule
	opts := &publicsuffix.FindOptions{DefaultRule: rule}
	if _, err := publicsuffix.ParseFromListWithOptions(list, domain, opts); err != nil {
		return false
	}
	return true
}

// init fills suffixBlacklist with every entry of defaultDenylist so that
// membership can be tested with a map lookup.
func init() {
	for i := range defaultDenylist {
		suffixBlacklist[defaultDenylist[i]] = struct{}{}
	}
}
29 changes: 29 additions & 0 deletions common/httpx/domains_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package httpx

import (
_ "embed"
"sort"
"testing"

"github.com/stretchr/testify/require"
)

//go:embed test-data/hackerone.html
var rawResponse string

// TestBodyGrabDoamins runs BodyDomainGrab against the embedded HTML fixture
// and checks how many domains and FQDNs are extracted from it.
func TestBodyGrabDoamins(t *testing.T) {
	client, err := New(&DefaultOptions)
	require.Nil(t, err)

	grabbed := client.BodyDomainGrab(&Response{Raw: rawResponse})

	// extraction order is not defined, so sort before inspecting the results
	sort.Strings(grabbed.Domains)
	sort.Strings(grabbed.Fqdns)

	t.Run("body domain grab", func(t *testing.T) {
		require.Len(t, grabbed.Domains, 24)
		require.Len(t, grabbed.Fqdns, 16)
	})
}
2 changes: 2 additions & 0 deletions common/httpx/httpx.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ get_response:
}

var resp Response
resp.Input = req.Host

resp.Headers = httpresp.Header.Clone()

Expand Down Expand Up @@ -313,6 +314,7 @@ get_response:

if h.Options.ExtractFqdn {
resp.CSPData = h.CSPGrab(&resp)
resp.BodyDomains = h.BodyDomainGrab(&resp)
}

// build the redirect flow by reverse cycling the response<-request chain
Expand Down
2 changes: 1 addition & 1 deletion common/httpx/httpx_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,6 @@ func TestDo(t *testing.T) {
require.Nil(t, err)
resp, err := ht.Do(req, UnsafeOptions{})
require.Nil(t, err)
require.Equal(t, 318, resp.ContentLength)
require.Greater(t, len(resp.Raw), 800)
})
}
2 changes: 2 additions & 0 deletions common/httpx/response.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

// Response contains the response to a server
type Response struct {
Input string // input that was given
StatusCode int
Headers map[string][]string
RawData []byte // undecoded data
Expand All @@ -21,6 +22,7 @@ type Response struct {
Lines int
TLSData *clients.Response
CSPData *CSPData
BodyDomains *BodyDomain
HTTP2 bool
Pipeline bool
Duration time.Duration
Expand Down
9 changes: 9 additions & 0 deletions common/httpx/test-data/hackerone.html

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -2211,6 +2211,10 @@ retry:
RequestRaw: requestDump,
Response: resp,
}
if resp.BodyDomains != nil {
result.Fqdns = resp.BodyDomains.Fqdns
result.Domains = resp.BodyDomains.Domains
}
return result
}

Expand Down
2 changes: 2 additions & 0 deletions runner/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ type Result struct {
ScreenshotPathRel string `json:"screenshot_path_rel,omitempty" csv:"screenshot_path_rel"`
KnowledgeBase map[string]interface{} `json:"knowledgebase,omitempty" csv:"knowledgebase"`
Resolvers []string `json:"resolvers,omitempty" csv:"resolvers"`
Fqdns []string `json:"body_fqdn,omitempty"`
Domains []string `json:"body_domains,omitempty"`

// Internal Fields
TechnologyDetails map[string]wappalyzer.AppInfo `json:"-" csv:"-"`
Expand Down

0 comments on commit 9330887

Please sign in to comment.