From bab67139bcc120321d21a1fc793e5b0d9d37c141 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 4 Dec 2023 15:00:14 +0000 Subject: [PATCH 1/7] chore(deps): bump github.com/spf13/cast from 1.5.1 to 1.6.0 Bumps [github.com/spf13/cast](https://github.com/spf13/cast) from 1.5.1 to 1.6.0. - [Release notes](https://github.com/spf13/cast/releases) - [Commits](https://github.com/spf13/cast/compare/v1.5.1...v1.6.0) --- updated-dependencies: - dependency-name: github.com/spf13/cast dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 3584b9f..11c5778 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,7 @@ require ( github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 github.com/logrusorgru/aurora/v3 v3.0.0 - github.com/spf13/cast v1.5.1 + github.com/spf13/cast v1.6.0 github.com/spf13/pflag v1.0.5 github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 gopkg.in/yaml.v3 v3.0.1 diff --git a/go.sum b/go.sum index 6c34dd3..d440eb1 100644 --- a/go.sum +++ b/go.sum @@ -3,7 +3,7 @@ dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= github.com/Mzack9999/go-http-digest-auth-client v0.6.0 h1:LXVNMsj7qiNVmlZByFbjJmXf6SOm/uoo04XmnNcWPms= github.com/Mzack9999/go-http-digest-auth-client v0.6.0/go.mod h1:gbwaYYXwA15ZfIxMyY5QU1acATDyNKEuG5TylBCL7AM= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/frankban/quicktest v1.14.4 h1:g2rn0vABPOOXmZUj+vbmUp0lPoXEMuhTpIluN0XL9UY= +github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead h1:Iep2G2h3hSwc7w0qr1iVVAptgXqjn7DRXVQ33luPmhk= github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead/go.mod h1:Faf/mOhyfNnLIfhoYj2vfPrjt0nKBr4WaU+OQ0C7r6U= @@ -21,8 +21,8 @@ github.com/logrusorgru/aurora/v3 v3.0.0 h1:R6zcoZZbvVcGMvDCKo45A9U/lzYyzl5NfYIvz github.com/logrusorgru/aurora/v3 v3.0.0/go.mod h1:vsR12bk5grlLvLXAYrBsb5Oc/N+LxAlxggSjiwMnCUc= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= -github.com/spf13/cast v1.5.1 h1:R+kOtfhWQE6TVQzY+4D7wJLBgkdVasCEFxSUBYBYIlA= -github.com/spf13/cast v1.5.1/go.mod h1:b9PdjNptOpzXr7Rq1q9gJML/2cdGQAo69NKzQ10KN48= +github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= +github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= From 3c59f0580b61eef4b8dc6f8d85f97f911e73a952 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:21:43 +0000 Subject: [PATCH 2/7] chore(deps): bump actions/setup-go from 4 to 5 Bumps [actions/setup-go](https://github.com/actions/setup-go) from 4 to 5. 
- [Release notes](https://github.com/actions/setup-go/releases) - [Commits](https://github.com/actions/setup-go/compare/v4...v5) --- updated-dependencies: - dependency-name: actions/setup-go dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/build-test.yml | 2 +- .github/workflows/lint-test.yml | 2 +- .github/workflows/release.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 5216c78..e2a67cb 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: '>=1.21' - diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint-test.yml index 88f9cdc..a6c52d5 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint-test.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: '>=1.21' cache: false diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 2334759..eff1f39 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: steps: - name: Set up Go - uses: actions/setup-go@v4 + uses: actions/setup-go@v5 with: go-version: '>=1.21' - From 43ae00abf9da87bf2fd0b712b30b996dda2921f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Dec 2023 14:39:23 +0000 Subject: [PATCH 3/7] chore(deps): bump github/codeql-action from 2 to 3 Bumps [github/codeql-action](https://github.com/github/codeql-action) from 2 to 3. - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/v2...v3) --- updated-dependencies: - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/codeql-analysis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index a978f59..40c0cac 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -35,12 +35,12 @@ jobs: uses: actions/checkout@v4 - name: Initialize CodeQL - uses: github/codeql-action/init@v2 + uses: github/codeql-action/init@v3 with: languages: ${{ matrix.language }} - name: Autobuild - uses: github/codeql-action/autobuild@v2 + uses: github/codeql-action/autobuild@v3 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 \ No newline at end of file + uses: github/codeql-action/analyze@v3 \ No newline at end of file From 30548bc1a07c5634e184802afa8cba24d25413b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:53:51 +0000 Subject: [PATCH 4/7] chore(deps): bump golangci/golangci-lint-action from 3 to 4 Bumps [golangci/golangci-lint-action](https://github.com/golangci/golangci-lint-action) from 3 to 4. 
- [Release notes](https://github.com/golangci/golangci-lint-action/releases) - [Commits](https://github.com/golangci/golangci-lint-action/compare/v3...v4) --- updated-dependencies: - dependency-name: golangci/golangci-lint-action dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- .github/workflows/lint-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint-test.yml index 88f9cdc..260fb59 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint-test.yml @@ -36,7 +36,7 @@ jobs: fetch-depth: 0 - name: Run golangci-lint - uses: golangci/golangci-lint-action@v3 + uses: golangci/golangci-lint-action@v4 with: version: v1.54.2 args: --timeout 5m From 095538dd11308d2fef886f18cb57025975cf976a Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 7 Feb 2024 13:23:22 +0300 Subject: [PATCH 5/7] docs(*): Update config location --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a4bfa96..cf5a629 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ go install -v github.com/hueristiq/xurlfind3r/cmd/xurlfind3r@latest ## Post Installation -`xurlfind3r` will work right after [installation](#installation). However, **[BeVigil](https://bevigil.com)**, **[Github](https://github.com)** and **[Intelligence X](https://intelx.io)** require API keys to work, **[URLScan](https://urlscan.io)** supports API key but not required. The API keys are stored in the `$HOME/.hueristiq/xurlfind3r/config.yaml` file - created upon first run - and uses the YAML format. Multiple API keys can be specified for each of these source from which one of them will be used. +`xurlfind3r` will work right after [installation](#installation). However, **[BeVigil](https://bevigil.com)**, **[Github](https://github.com)** and **[Intelligence X](https://intelx.io)** require API keys to work, **[URLScan](https://urlscan.io)** supports API key but not required. The API keys are stored in the `$HOME/.config/xurlfind3r/config.yaml` file - created upon first run - and uses the YAML format. Multiple API keys can be specified for each of these source from which one of them will be used. Example `config.yaml`: From 7c2d7c4e626d657a762cf76fd827930401ef4f56 Mon Sep 17 00:00:00 2001 From: "Alex Munene (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Wed, 7 Feb 2024 13:26:20 +0300 Subject: [PATCH 6/7] build(*): Refactor makefile --- Makefile | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index c1957ef..d21bae4 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,8 @@ -# Go(Golang) Options +SHELL = /bin/bash + +all: go-build + +# --- Go(Golang) ------------------------------------------------------------------------------------ GOCMD=go GOMOD=$(GOCMD) mod GOGET=$(GOCMD) get @@ -9,44 +13,38 @@ GOINSTALL=$(GOCMD) install GOFLAGS := -v LDFLAGS := -s -w -# Golangci Options -GOLANGCILINTCMD=golangci-lint -GOLANGCILINTRUN=$(GOLANGCILINTCMD) run - ifneq ($(shell go env GOOS),darwin) LDFLAGS := -extldflags "-static" endif -all: build +GOLANGCILINTCMD=golangci-lint +GOLANGCILINTRUN=$(GOLANGCILINTCMD) run -.PHONY: tidy -tidy: +.PHONY: go-mod-tidy +go-mod-tidy: $(GOMOD) tidy -.PHONY: update-deps -update-deps: +.PHONY: go-mod-update +go-mod-update: $(GOGET) -f -t -u ./... $(GOGET) -f -u ./... 
-.PHONY: _gofmt -_gofmt: +.PHONY: go-fmt +go-fmt: $(GOFMT) ./... -.PHONY: _golangci-lint -_golangci-lint: +.PHONY: go-lint +go-lint: go-fmt $(GOLANGCILINTRUN) $(GOLANGCILINT) ./... -.PHONY: lint -lint: _gofmt _golangci-lint - -.PHONY: test -test: +.PHONY: go-test +go-test: $(GOTEST) $(GOFLAGS) ./... -.PHONY: build -build: +.PHONY: go-build +go-build: $(GOBUILD) $(GOFLAGS) -ldflags '$(LDFLAGS)' -o bin/xurlfind3r cmd/xurlfind3r/main.go -.PHONY: install -install: - $(GOINSTALL) $(GOFLAGS) ./... +.PHONY: go-install +go-install: + $(GOINSTALL) $(GOFLAGS) ./... \ No newline at end of file From ddbd111fa2afff6cce85a291ec988ced7ff9d7be Mon Sep 17 00:00:00 2001 From: "Alex (@enenumxela)" <62714471+enenumxela@users.noreply.github.com> Date: Mon, 14 Oct 2024 20:46:26 +0300 Subject: [PATCH 7/7] chore(*): - --- .dockerignore | 8 + .../workflows/{build-test.yml => build.yml} | 6 +- .../{codeql-analysis.yml => codeql.yml} | 20 +- .github/workflows/dockerhub-push.yaml | 44 +++ .github/workflows/{lint-test.yml => lint.yml} | 12 +- .github/workflows/release.yml | 2 +- .gitignore | 2 - .golangci.yaml | 373 ++++++++++++------ .vscode/extenstions.json | 5 + .vscode/settings.json | 6 + Dockerfile | 67 ++++ Makefile | 130 +++++- README.md | 108 +++-- cmd/xurlfind3r/main.go | 123 +++--- go.mod | 38 +- go.sum | 88 +++-- internal/configuration/configuration.go | 15 +- pkg/httpclient/client.go | 13 +- pkg/scraper/scraper.go | 135 ------- pkg/scraper/sources/sources.go | 50 --- pkg/scraper/sources/wayback/wayback.go | 243 ------------ pkg/scraper/sources/wayback/waybackrobots.go | 112 ------ pkg/scraper/sources/wayback/waybacksource.go | 187 --------- .../sources/bevigil/bevigil.go | 35 +- .../sources/commoncrawl/commoncrawl.go | 45 +-- pkg/xurlfind3r/sources/configuration.go | 66 ++++ .../sources/github/github.go | 75 ++-- .../sources/github/tokenmanager.go | 0 .../sources/intelx/intelx.go | 42 +- .../sources/otx/otx.go | 37 +- pkg/xurlfind3r/sources/result.go | 19 + pkg/xurlfind3r/sources/sources.go | 39 ++ .../sources/urlscan/urlscan.go | 30 +- pkg/{scraper => xurlfind3r}/sources/utils.go | 54 --- pkg/xurlfind3r/sources/utils_test.go | 68 ++++ pkg/xurlfind3r/sources/wayback/wayback.go | 102 +++++ pkg/xurlfind3r/xurlfind3r.go | 223 +++++++++++ 37 files changed, 1355 insertions(+), 1267 deletions(-) create mode 100644 .dockerignore rename .github/workflows/{build-test.yml => build.yml} (91%) rename .github/workflows/{codeql-analysis.yml => codeql.yml} (83%) create mode 100644 .github/workflows/dockerhub-push.yaml rename .github/workflows/{lint-test.yml => lint.yml} (75%) create mode 100644 .vscode/extenstions.json create mode 100644 .vscode/settings.json create mode 100644 Dockerfile delete mode 100644 pkg/scraper/scraper.go delete mode 100644 pkg/scraper/sources/sources.go delete mode 100644 pkg/scraper/sources/wayback/wayback.go delete mode 100644 pkg/scraper/sources/wayback/waybackrobots.go delete mode 100644 pkg/scraper/sources/wayback/waybacksource.go rename pkg/{scraper => xurlfind3r}/sources/bevigil/bevigil.go (69%) rename pkg/{scraper => xurlfind3r}/sources/commoncrawl/commoncrawl.go (81%) create mode 100644 pkg/xurlfind3r/sources/configuration.go rename pkg/{scraper => xurlfind3r}/sources/github/github.go (76%) rename pkg/{scraper => xurlfind3r}/sources/github/tokenmanager.go (100%) rename pkg/{scraper => xurlfind3r}/sources/intelx/intelx.go (84%) rename pkg/{scraper => xurlfind3r}/sources/otx/otx.go (69%) create mode 100644 pkg/xurlfind3r/sources/result.go create mode 100644 
pkg/xurlfind3r/sources/sources.go rename pkg/{scraper => xurlfind3r}/sources/urlscan/urlscan.go (80%) rename pkg/{scraper => xurlfind3r}/sources/utils.go (70%) create mode 100644 pkg/xurlfind3r/sources/utils_test.go create mode 100644 pkg/xurlfind3r/sources/wayback/wayback.go create mode 100644 pkg/xurlfind3r/xurlfind3r.go diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ae1ecbc --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.github +bin +.gitignore +.golangci.yaml +.goreleaser.yaml +CONTRIBUTING.md +LICENSE +README.md \ No newline at end of file diff --git a/.github/workflows/build-test.yml b/.github/workflows/build.yml similarity index 91% rename from .github/workflows/build-test.yml rename to .github/workflows/build.yml index e2a67cb..20b722b 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build.yml @@ -1,4 +1,4 @@ -name: 🔨 Build Test +name: 🔨 Build on: push: @@ -17,7 +17,7 @@ on: jobs: build: - name: Build Test + name: Build strategy: matrix: os: [ubuntu-latest, windows-latest, macOS-12] @@ -27,7 +27,7 @@ jobs: name: Set up Go uses: actions/setup-go@v5 with: - go-version: '>=1.21' + go-version: '>=1.23' - name: Checkout the repository uses: actions/checkout@v4 diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql.yml similarity index 83% rename from .github/workflows/codeql-analysis.yml rename to .github/workflows/codeql.yml index 40c0cac..5c29437 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql.yml @@ -1,4 +1,4 @@ -name: 🚨 CodeQL Analysis +name: 🚨 Analyze Code (CodeQL) on: push: @@ -17,22 +17,22 @@ on: jobs: analyze: - name: CodeQL Analysis + name: Analyze Code (CodeQL) + strategy: + fail-fast: false + matrix: + language: [ 'go' ] runs-on: ubuntu-latest permissions: actions: read contents: read security-events: write - - strategy: - fail-fast: false - matrix: - language: [ 'go' ] - steps: - - - name: Checkout repository + - + name: Checkout the repository uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Initialize CodeQL uses: github/codeql-action/init@v3 diff --git a/.github/workflows/dockerhub-push.yaml b/.github/workflows/dockerhub-push.yaml new file mode 100644 index 0000000..280bb73 --- /dev/null +++ b/.github/workflows/dockerhub-push.yaml @@ -0,0 +1,44 @@ +name: 🐋 DockerHub Push + +on: + workflow_run: + workflows: ["🎉 Release"] + types: + - completed + workflow_dispatch: + +jobs: + push: + name: DockerHub Push + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + attestations: write + id-token: write + steps: + - + name: Checkout + uses: actions/checkout@v4 + + - + name: Get Github tag + id: meta + run: | + curl --silent "https://api.github.com/repos/hueristiq/xurlfind3r/releases/latest" | jq -r .tag_name | xargs -I {} echo TAG={} >> $GITHUB_OUTPUT + + - + name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - + name: Build and push Docker image + uses: docker/build-push-action@v6 + with: + context: . 
+ file: ./Dockerfile + push: true + tags: hueristiq/xurlfind3r:latest,hueristiq/xurlfind3r:${{ steps.meta.outputs.TAG }} diff --git a/.github/workflows/lint-test.yml b/.github/workflows/lint.yml similarity index 75% rename from .github/workflows/lint-test.yml rename to .github/workflows/lint.yml index c9d603d..5239923 100644 --- a/.github/workflows/lint-test.yml +++ b/.github/workflows/lint.yml @@ -1,4 +1,4 @@ -name: 💅 Lint Test +name: 💅🏻 Lint on: push: @@ -20,14 +20,14 @@ permissions: jobs: lint: - name: Lint Test + name: Lint runs-on: ubuntu-latest steps: - name: Set up Go uses: actions/setup-go@v5 with: - go-version: '>=1.21' + go-version: '>=1.23' cache: false - name: Checkout the repository @@ -36,8 +36,12 @@ jobs: fetch-depth: 0 - name: Run golangci-lint +<<<<<<< HEAD:.github/workflows/lint-test.yml uses: golangci/golangci-lint-action@v4 +======= + uses: golangci/golangci-lint-action@v6 +>>>>>>> 0d0d68f (chore(*): -):.github/workflows/lint.yml with: - version: v1.54.2 + version: v1.61.0 args: --timeout 5m working-directory: . \ No newline at end of file diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index eff1f39..5ad1775 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -16,7 +16,7 @@ jobs: name: Set up Go uses: actions/setup-go@v5 with: - go-version: '>=1.21' + go-version: '>=1.23' - name: Checkout the repository uses: actions/checkout@v4 diff --git a/.gitignore b/.gitignore index 77a98d4..c5e82d7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1 @@ -# Executable - bin \ No newline at end of file diff --git a/.golangci.yaml b/.golangci.yaml index 3b880ee..0e6d4e6 100644 --- a/.golangci.yaml +++ b/.golangci.yaml @@ -1,122 +1,232 @@ +# Options for analysis running. run: + # Number of operating system threads (`GOMAXPROCS`) that can execute golangci-lint simultaneously. + # If it is explicitly set to 0 (i.e. not the default) then golangci-lint will automatically set the value to match Linux container CPU quota. + # Default: the number of logical CPUs in the machine + # concurrency: 4 # Timeout for analysis, e.g. 30s, 5m. # Default: 1m timeout: 5m + # Exit code when at least one issue was found. + # Default: 1 + issues-exit-code: 1 + # Include test files or not. + # Default: true + tests: true + # List of build tags, all linters use it. + # Default: [] + build-tags: [] + # If set, we pass it to "go list -mod={option}". From "go help modules": + # If invoked with -mod=readonly, the go command is disallowed from the implicit + # automatic updating of go.mod described above. Instead, it fails when any changes + # to go.mod are needed. This setting is most useful to check that go.mod does + # not need updates, such as in a continuous integration and testing system. + # If invoked with -mod=vendor, the go command assumes that the vendor + # directory holds the correct copies of dependencies and ignores + # the dependency descriptions in go.mod. + # + # Allowed values: readonly|vendor|mod + # Default: "" + modules-download-mode: readonly + # Allow multiple parallel golangci-lint instances running. + # If false, golangci-lint acquires file lock on start. + # Default: false + allow-parallel-runners: true + # Allow multiple golangci-lint instances running, but serialize them around a lock. + # If false, golangci-lint exits with an error if it fails to acquire file lock on start. + # Default: false + allow-serial-runners: true + # Define the Go version limit. + # Mainly related to generics support since go1.18. 
+ # Default: use Go version from the go.mod file, fallback on the env var `GOVERSION`, fallback on 1.17 + go: '1.23' + +# output configuration options +output: + # The formats used to render issues. + # Formats: + # - `colored-line-number` + # - `line-number` + # - `json` + # - `colored-tab` + # - `tab` + # - `html` + # - `checkstyle` + # - `code-climate` + # - `junit-xml` + # - `junit-xml-extended` + # - `github-actions` + # - `teamcity` + # - `sarif` + # Output path can be either `stdout`, `stderr` or path to the file to write to. + # + # For the CLI flag (`--out-format`), multiple formats can be specified by separating them by comma. + # The output can be specified for each of them by separating format name and path by colon symbol. + # Example: "--out-format=checkstyle:report.xml,json:stdout,colored-line-number" + # The CLI flag (`--out-format`) override the configuration file. + # + # Default: + # formats: + # - format: colored-line-number + # path: stdout + formats: + # - + # format: json + # path: stderr + # - + # format: checkstyle + # path: report.xml + - + format: colored-line-number + path: stderr + # Print lines of code with issue. + # Default: true + print-issued-lines: true + # Print linter name in the end of issue text. + # Default: true + print-linter-name: true + # Make issues output unique by line. + # Default: true + uniq-by-line: false + # Add a prefix to the output file references. + # Default: "" + path-prefix: "" + # Sort results by the order defined in `sort-order`. + # Default: false + sort-results: true + # Order to use when sorting results. + # Require `sort-results` to `true`. + # Possible values: `file`, `linter`, and `severity`. + # + # If the severity values are inside the following list, they are ordered in this order: + # 1. error + # 2. warning + # 3. high + # 4. medium + # 5. low + # Either they are sorted alphabetically. + # + # Default: ["file"] + sort-order: + - linter + - severity + - file # filepath, line, and column. + # Show statistics per linter. + # Default: false + show-stats: false linters: # Disable all linters. # Default: false disable-all: true # Enable specific linter + # https://golangci-lint.run/usage/linters/#enabled-by-default enable: - # Enabled by Default - - errcheck # errcheck is a program for checking for unchecked errors in Go code. These unchecked errors can be critical bugs in some cases [fast: false, auto-fix: false] - - gosimple # (megacheck) # Linter for Go source code that specializes in simplifying code [fast: false, auto-fix: false] - - govet # (vet, vetshadow) # Vet examines Go source code and reports suspicious constructs, such as Printf calls whose arguments do not align with the format string [fast: false, auto-fix: false] - - ineffassign # Detects when assignments to existing variables are not used [fast: true, auto-fix: false] - - staticcheck # (megacheck) # It's a set of rules from staticcheck. It's not the same thing as the staticcheck binary. The author of staticcheck doesn't support or approve the use of staticcheck as a library inside golangci-lint. 
[fast: false, auto-fix: false] - - unused # (megacheck) # Checks Go code for unused constants, variables, functions and types [fast: false, auto-fix: false] - # Disabled by Default - - asasalint # check for pass []any as any in variadic func(...any) [fast: false, auto-fix: false] - - asciicheck # Simple linter to check that your code does not contain non-ASCII identifiers [fast: true, auto-fix: false] - - bidichk # Checks for dangerous unicode character sequences [fast: true, auto-fix: false] - - bodyclose # checks whether HTTP response body is closed successfully [fast: false, auto-fix: false] - - containedctx # containedctx is a linter that detects struct contained context.Context field [fast: false, auto-fix: false] - - contextcheck # check whether the function uses a non-inherited context [fast: false, auto-fix: false] - # - cyclop # checks function and package cyclomatic complexity [fast: false, auto-fix: false] - # - deadcode # [deprecated] # Finds unused code [fast: false, auto-fix: false] - - decorder # check declaration order and count of types, constants, variables and functions [fast: true, auto-fix: false] - # - depguard # Go linter that checks if package imports are in a list of acceptable packages [fast: true, auto-fix: false] - - dogsled # Checks assignments with too many blank identifiers (e.g. x, _, _, _, := f()) [fast: true, auto-fix: false] - # - dupl # Tool for code clone detection [fast: true, auto-fix: false] - - dupword # checks for duplicate words in the source code [fast: true, auto-fix: true] - - durationcheck # check for two durations multiplied together [fast: false, auto-fix: false] - - errchkjson # Checks types passed to the json encoding functions. Reports unsupported types and optionally reports occasions, where the check for the returned error can be omitted. [fast: false, auto-fix: false] - - errname # Checks that sentinel errors are prefixed with the `Err` and error types are suffixed with the `Error`. [fast: false, auto-fix: false] - - errorlint # errorlint is a linter for that can be used to find code that will cause problems with the error wrapping scheme introduced in Go 1.13. [fast: false, auto-fix: false] - - execinquery # execinquery is a linter about query string checker in Query function which reads your Go src files and warning it finds [fast: false, auto-fix: false] - - exhaustive # check exhaustiveness of enum switch statements [fast: false, auto-fix: false] - # - exhaustivestruct # [deprecated] # Checks if all struct's fields are initialized [fast: false, auto-fix: false] - # - exhaustruct # Checks if all structure fields are initialized [fast: false, auto-fix: false] - - exportloopref # checks for pointers to enclosing loop variables [fast: false, auto-fix: false] - # - forbidigo # Forbids identifiers [fast: false, auto-fix: false] - - forcetypeassert # finds forced type assertions [fast: true, auto-fix: false] - # - funlen # Tool for detection of long functions [fast: true, auto-fix: false] - - gci # Gci controls Go package import order and makes it always deterministic. [fast: true, auto-fix: false] - - ginkgolinter # enforces standards of using ginkgo and gomega [fast: false, auto-fix: false] - - gocheckcompilerdirectives # Checks that go compiler directive comments (//go:) are valid. 
[fast: true, auto-fix: false] - # - gochecknoglobals # check that no global variables exist [fast: false, auto-fix: false] - # - gochecknoinits # Checks that no init functions are present in Go code [fast: true, auto-fix: false] - # - gocognit # Computes and checks the cognitive complexity of functions [fast: true, auto-fix: false] - - goconst # Finds repeated strings that could be replaced by a constant [fast: true, auto-fix: false] - - gocritic # Provides diagnostics that check for bugs, performance and style issues. [fast: false, auto-fix: false] - # - gocyclo # Computes and checks the cyclomatic complexity of functions [fast: true, auto-fix: false] - # - godot # Check if comments end in a period [fast: true, auto-fix: true] - # - godox # Tool for detection of FIXME, TODO and other comment keywords [fast: true, auto-fix: false] - # - goerr113 # Go linter to check the errors handling expressions [fast: false, auto-fix: false] - - gofmt # Gofmt checks whether code was gofmt-ed. By default this tool runs with -s option to check for code simplification [fast: true, auto-fix: true] - - gofumpt # Gofumpt checks whether code was gofumpt-ed. [fast: true, auto-fix: true] - - goheader # Checks is file header matches to pattern [fast: true, auto-fix: false] - - goimports # Check import statements are formatted according to the 'goimport' command. Reformat imports in autofix mode. [fast: true, auto-fix: true] - # - golint # [deprecated] # Golint differs from gofmt. Gofmt reformats Go source code, whereas golint prints out style mistakes [fast: false, auto-fix: false] - # - gomnd # An analyzer to detect magic numbers. [fast: true, auto-fix: false] - - gomoddirectives # Manage the use of 'replace', 'retract', and 'excludes' directives in go.mod. [fast: true, auto-fix: false] - - gomodguard # Allow and block list linter for direct Go module dependencies. This is different from depguard where there are different block types for example version constraints and module recommendations. [fast: true, auto-fix: false] - - goprintffuncname # Checks that printf-like functions are named with `f` at the end [fast: true, auto-fix: false] - # - gosec # (gas) # Inspects source code for security problems [fast: false, auto-fix: false] - - gosmopolitan # Report certain i18n/l10n anti-patterns in your Go codebase [fast: false, auto-fix: false] - - grouper # An analyzer to analyze expression groups. [fast: true, auto-fix: false] - # - ifshort # [deprecated] # Checks that your code uses short syntax for if-statements whenever possible [fast: true, auto-fix: false] - - importas # Enforces consistent import aliases [fast: false, auto-fix: false] - - interfacebloat # A linter that checks the number of methods inside an interface. [fast: true, auto-fix: false] - # - interfacer # [deprecated] # Linter that suggests narrower interface types [fast: false, auto-fix: false] - # - ireturn # Accept Interfaces, Return Concrete Types [fast: false, auto-fix: false] - # - lll # Reports long lines [fast: true, auto-fix: false] - - loggercheck # (logrlint) # Checks key value pairs for common logger libraries (kitlog,klog,logr,zap). [fast: false, auto-fix: false] - # - maintidx # maintidx measures the maintainability index of each function. 
[fast: true, auto-fix: false] - - makezero # Finds slice declarations with non-zero initial length [fast: false, auto-fix: false] - # - maligned # [deprecated] # Tool to detect Go structs that would take less memory if their fields were sorted [fast: false, auto-fix: false] - - mirror # reports wrong mirror patterns of bytes/strings usage [fast: false, auto-fix: false] - - misspell # Finds commonly misspelled English words in comments [fast: true, auto-fix: true] - - musttag # enforce field tags in (un)marshaled structs [fast: false, auto-fix: false] - # - nakedret # Finds naked returns in functions greater than a specified function length [fast: true, auto-fix: false] - - nestif # Reports deeply nested if statements [fast: true, auto-fix: false] - - nilerr # Finds the code that returns nil even if it checks that the error is not nil. [fast: false, auto-fix: false] - - nilnil # Checks that there is no simultaneous return of `nil` error and an invalid value. [fast: false, auto-fix: false] - - nlreturn # nlreturn checks for a new line before return and branch statements to increase code clarity [fast: true, auto-fix: false] - - noctx # noctx finds sending http request without context.Context [fast: false, auto-fix: false] - - nolintlint # Reports ill-formed or insufficient nolint directives [fast: true, auto-fix: false] - # - nonamedreturns # Reports all named returns [fast: false, auto-fix: false] - # - nosnakecase # [deprecated] # nosnakecase is a linter that detects snake case of variable naming and function name. [fast: true, auto-fix: false] - - nosprintfhostport # Checks for misuse of Sprintf to construct a host with port in a URL. [fast: true, auto-fix: false] - - paralleltest # paralleltest detects missing usage of t.Parallel() method in your Go test [fast: false, auto-fix: false] - - prealloc # Finds slice declarations that could potentially be pre-allocated [fast: true, auto-fix: false] - - predeclared # find code that shadows one of Go's predeclared identifiers [fast: true, auto-fix: false] - # - promlinter # Check Prometheus metrics naming via promlint [fast: true, auto-fix: false] - - reassign # Checks that package variables are not reassigned [fast: false, auto-fix: false] - - revive # Fast, configurable, extensible, flexible, and beautiful linter for Go. Drop-in replacement of golint. [fast: false, auto-fix: false] - - rowserrcheck # checks whether Err of rows is checked successfully [fast: false, auto-fix: false] - # - scopelint # [deprecated] # Scopelint checks for unpinned variables in go programs [fast: true, auto-fix: false] - - sqlclosecheck # Checks that sql.Rows and sql.Stmt are closed. [fast: false, auto-fix: false] - # - structcheck # [deprecated] # Finds unused struct fields [fast: false, auto-fix: false] - - stylecheck # Stylecheck is a replacement for golint [fast: false, auto-fix: false] - - tagalign # check that struct tags are well aligned [fast: true, auto-fix: true] - # - tagliatelle # Checks the struct tags. 
[fast: true, auto-fix: false] - - tenv # tenv is analyzer that detects using os.Setenv instead of t.Setenv since Go1.17 [fast: false, auto-fix: false] - - testableexamples # linter checks if examples are testable (have an expected output) [fast: true, auto-fix: false] - - testpackage # linter that makes you use a separate _test package [fast: true, auto-fix: false] - - thelper # thelper detects Go test helpers without t.Helper() call and checks the consistency of test helpers [fast: false, auto-fix: false] - - tparallel # tparallel detects inappropriate usage of t.Parallel() method in your Go test codes [fast: false, auto-fix: false] - - unconvert # Remove unnecessary type conversions [fast: false, auto-fix: false] - - unparam # Reports unused function parameters [fast: false, auto-fix: false] - - usestdlibvars # A linter that detect the possibility to use variables/constants from the Go standard library. [fast: true, auto-fix: false] - # - varcheck # [deprecated] # Finds unused global variables and constants [fast: false, auto-fix: false] - # - varnamelen # checks that the length of a variable's name matches its scope [fast: false, auto-fix: false] - - wastedassign # wastedassign finds wasted assignment statements. [fast: false, auto-fix: false] - - whitespace # Tool for detection of leading and trailing whitespace [fast: true, auto-fix: true] - # - wrapcheck # Checks that errors returned from external packages are wrapped [fast: false, auto-fix: false] - - wsl # Whitespace Linter - Forces you to use empty lines! [fast: true, auto-fix: false] - - zerologlint # Detects the wrong usage of `zerolog` that a user forgets to dispatch with `Send` or `Msg`. [fast: false, auto-fix: false] + - asasalint + - asciicheck + - bidichk + - bodyclose + - canonicalheader + - containedctx + - contextcheck + - copyloopvar + # - cyclop + - decorder + # - depguard + - dogsled + - dupl + - dupword + - durationcheck + - err113 + - errcheck + - errchkjson + - errname + - errorlint + - exhaustive + # - exhaustruct + - fatcontext + - forbidigo + - forcetypeassert + # - funlen + - gci + - ginkgolinter + - gocheckcompilerdirectives + # - gochecknoglobals + # - gochecknoinits + - gochecksumtype + # - gocognit + - goconst + - gocritic + # - gocyclo + - godot + - godox + - gofmt + - gofumpt + - goheader + - goimports + - gomoddirectives + - gomodguard + - goprintffuncname + - gosec + - gosimple + - gosmopolitan + - govet + - grouper + - importas + - inamedparam + - ineffassign + - interfacebloat + - intrange + - ireturn + # - lll + - loggercheck + - maintidx + - makezero + - mirror + - misspell + # - mnd + - musttag + # - nakedret + - nestif + - nilerr + - nilnil + - nlreturn + - noctx + - nolintlint + # - nonamedreturns + - nosprintfhostport + - paralleltest + # - perfsprint + - prealloc + - predeclared + - promlinter + - protogetter + - reassign + - revive + - rowserrcheck + - sloglint + - spancheck + - sqlclosecheck + - staticcheck + - stylecheck + - tagalign + # - tagliatelle + - tenv + - testableexamples + - testifylint + - testpackage + - thelper + - tparallel + - unconvert + - unparam + - unused + - usestdlibvars + # - varnamelen + - wastedassign + - whitespace + - wrapcheck + - wsl + - zerologlint linters-settings: goconst: @@ -135,26 +245,39 @@ linters-settings: # Minimal code complexity to report. # Default: 30 (but we recommend 10-20) min-complexity: 10 - govet: - check-shadowing: true - varnamelen: - # The minimum length of a variable's name that is considered "long". 
- # Variable names that are at least this long will be ignored. - # Default: 3 - min-name-length: 2 - # Check method receivers. - # Default: false - check-receiver: true - # Check named return values. - # Default: false - check-return: true - # Check type parameters. - # Default: false - check-type-param: true + # varnamelen: + # # The minimum length of a variable's name that is considered "long". + # # Variable names that are at least this long will be ignored. + # # Default: 3 + # min-name-length: 2 + # # Check method receivers. + # # Default: false + # check-receiver: true + # # Check named return values. + # # Default: false + # check-return: true + # # Check type parameters. + # # Default: false + # check-type-param: true whitespace: # Enforces newlines (or comments) after every multi-line if statement. # Default: false multi-if: true # Enforces newlines (or comments) after every multi-line function signature. # Default: false - multi-func: true \ No newline at end of file + multi-func: true + +issues: + # Which dirs to exclude: issues from them won't be reported. + # Can use regexp here: `generated.*`, regexp is applied on full path, + # including the path prefix if one is set. + # Default dirs are skipped independently of this option's value (see exclude-dirs-use-default). + # "/" will be replaced by current OS file path separator to properly work on Windows. + # Default: [] + exclude-dirs: [] + # Show issues in any part of update files (requires new-from-rev or new-from-patch). + # Default: false + whole-files: false + # Fix found issues (if it's supported by the linter). + # Default: false + fix: true \ No newline at end of file diff --git a/.vscode/extenstions.json b/.vscode/extenstions.json new file mode 100644 index 0000000..7203cb3 --- /dev/null +++ b/.vscode/extenstions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "golang.go" + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1a653cd --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,6 @@ +{ + "go.lintTool": "golangci-lint", + "go.lintFlags": [ + "--fast" + ] +} \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d73b5c1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,67 @@ +# Use the official Golang image version 1.23 with the Alpine distribution as the base image for the build stage. +# This multi-stage build starts with the "build-stage" stage where the Go application will be compiled. +FROM golang:1.23.1-alpine3.20 AS build-stage + +# Perform system updates and install necessary packages. +# - `apk --no-cache update`: Updates the Alpine package repository without caching index files. +# - `apk --no-cache upgrade`: Upgrades all installed packages to the latest available versions. +# - `apk --no-cache add`: Installs additional required packages: +# - `ca-certificates`: For managing CA certificates for secure communication. +# - `curl`: For making HTTP requests (can be used to download files or for health checks). +# - `gcc` and `g++`: The GNU Compiler Collection used for compiling C and C++ code, essential for building Go applications. +# - `git`: Required for downloading Go modules that reference external repositories. +# - `make`: Utility for automating build processes and running the `Makefile`. 
+RUN <-linux-amd64.tar.gz ``` -> **TIP:** The above steps, download and extract, can be combined into a single step with this onliner +> [!TIP] +> The above steps, download and extract, can be combined into a single step with this onliner > > ```bash > curl -sL https://github.com/hueristiq/xurlfind3r/releases/download/v/xurlfind3r--linux-amd64.tar.gz | tar -xzv > ``` -**NOTE:** On Windows systems, you should be able to double-click the zip archive to extract the `xurlfind3r` executable. +> [!NOTE] +> On Windows systems, you should be able to double-click the zip archive to extract the `xurlfind3r` executable. ...move the `xurlfind3r` binary to somewhere in your `PATH`. For example, on GNU/Linux and OS X systems: @@ -70,7 +70,9 @@ tar xf xurlfind3r--linux-amd64.tar.gz sudo mv xurlfind3r /usr/local/bin/ ``` -**NOTE:** Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xurlfind3r` to their `PATH`. +> [!NOTE] +> Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xurlfind3r` to their `PATH`. + ### Install source (With Go Installed) @@ -103,50 +105,48 @@ go install -v github.com/hueristiq/xurlfind3r/cmd/xurlfind3r@latest sudo mv xurlfind3r /usr/local/bin/ ``` - **NOTE:** Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xurlfind3r` to their `PATH`. + Windows users can follow [How to: Add Tool Locations to the PATH Environment Variable](https://msdn.microsoft.com/en-us/library/office/ee537574(v=office.14).aspx) in order to add `xurlfind3r` to their `PATH`. + + +> [!CAUTION] +> While the development version is a good way to take a peek at `xurlfind3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. + +### Install on Docker (With Docker Installed) + +To install `xurlfind3r` on docker: + +* Pull the docker image using: + ```bash + docker pull hueristiq/xurlfind3r:latest + ``` -**NOTE:** While the development version is a good way to take a peek at `xurlfind3r`'s latest features before they get released, be aware that it may have bugs. Officially released versions will generally be more stable. +* Run `xurlfind3r` using the image: + + ```bash + docker run --rm hueristiq/xurlfind3r:latest -h + ``` ## Post Installation -`xurlfind3r` will work right after [installation](#installation). However, **[BeVigil](https://bevigil.com)**, **[Github](https://github.com)** and **[Intelligence X](https://intelx.io)** require API keys to work, **[URLScan](https://urlscan.io)** supports API key but not required. The API keys are stored in the `$HOME/.config/xurlfind3r/config.yaml` file - created upon first run - and uses the YAML format. Multiple API keys can be specified for each of these source from which one of them will be used. - -Example `config.yaml`: - -> **NOTE:** The keys/tokens below are invalid, use your own keys/tokens! 
- -```yaml -version: 0.4.0 -sources: - - bevigil - - commoncrawl - - github - - intelx - - otx - - urlscan - - wayback -keys: - bevigil: - - awA5nvpKU3N8ygkZ - github: - - d23a554bbc1aabb208c9acfbd2dd41ce7fc9db39 - - asdsd54bbc1aabb208c9acfbd2dd41ce7fc9db39 - intelx: - - 2.intelx.io:00000000-0000-0000-0000-000000000000 - urlscan: - - d4c85d34-e425-446e-d4ab-f5a3412acbe8 +`xurlfind3r` will work right after [installation](#installation). However, some sources require API keys to work. These keys can be added to a configuration file at `$HOME/.config/xurlfind3r/config.yaml`, created upon first run, or set as environment variables. + +Example of environment variables for API keys: + +```bash +XURLFIND3R_KEYS_BEVIGIL=your_bevigil_key +XURLFIND3R_KEYS_ONTELX=your_intelx_key ``` ## Usage -To display help message for `xurlfind3r` use the `-h` flag: +To start using `xurlfind3r`, open your terminal and run the following command for a list of options: ```bash xurlfind3r -h ``` -help message: +Here's what the help message looks like: ``` @@ -157,20 +157,18 @@ __ ___ _ _ __| |/ _(_)_ __ __| |___ / _ __ /_/\_\\__,_|_| |_|_| |_|_| |_|\__,_|____/|_| v0.4.0 - with <3 by Hueristiq Open Source - USAGE: - xurlfind3r [OPTIONS] + xurlfind3r [OPTIONS] CONFIGURATION: - -c, --configuration string configuration file path (default: $HOME/.config/xurlfind3r/config.yaml) + -c, --configuration string configuration file (default: $HOME/.cfg/xurlfind3r/config.yaml) INPUT: -d, --domain string[] target domain -l, --list string target domains' list file path - TIP: For multiple input domains use comma(,) separated value with `-d`, - specify multiple `-d`, load from file with `-l` or load from stdin. +TIP: For multiple input domains use comma(,) separated value with `-d`, + specify multiple `-d`, load from file with `-l` or load from stdin. SCOPE: --include-subdomains bool match subdomain's URLs @@ -179,8 +177,6 @@ SOURCES: --sources bool list supported sources -u, --use-sources string[] comma(,) separated sources to use -e, --exclude-sources string[] comma(,) separated sources to exclude - --parse-wayback-robots bool with wayback, parse robots.txt snapshots - --parse-wayback-source bool with wayback, parse source code snapshots FILTER & MATCH: -f, --filter string regex to filter URLs @@ -192,6 +188,8 @@ OUTPUT: -O, --output-directory string output URLs directory path -s, --silent bool display output subdomains only -v, --verbose bool display verbose output + +pflag: help requested ``` ### Examples @@ -218,22 +216,22 @@ xurlfind3r -d hackerone.com --include-subdomains -m '^https?://[^/]*?/.*\.js(\?[ ## Contributing -[Issues](https://github.com/hueristiq/xurlfind3r/issues) and [Pull Requests](https://github.com/hueristiq/xurlfind3r/pulls) are welcome! **Check out the [contribution guidelines](https://github.com/hueristiq/xurlfind3r/blob/master/CONTRIBUTING.md).** +We welcome contributions! Feel free to submit [Pull Requests](https://github.com/hueristiq/xurlfind3r/pulls) or report [Issues](https://github.com/hueristiq/xurlfind3r/issues). For more details, check out the [contribution guidelines](https://github.com/hueristiq/xurlfind3r/blob/master/CONTRIBUTING.md). ## Licensing -This utility is distributed under the [MIT license](https://github.com/hueristiq/xurlfind3r/blob/master/LICENSE). +This utility is licensed under the [MIT license](https://opensource.org/license/mit). You are free to use, modify, and distribute it, as long as you follow the terms of the license. 
You can find the full license text in the repository - [Full MIT license text](https://github.com/hueristiq/xurlfind3r/blob/master/LICENSE). ## Credits ### Contributors -Thanks to the amazing [contributors](https://github.com/hueristiq/xurlfind3r/graphs/contributors) for keeping this project alive. +A huge thanks to all the contributors who have helped make `xurlfind3r` what it is today! [![contributors](https://contrib.rocks/image?repo=hueristiq/xurlfind3r&max=500)](https://github.com/hueristiq/xurlfind3r/graphs/contributors) ### Similar Projects -Thanks to similar open source projects - check them out, may fit in your workflow. +If you're interested in more utilities like this, check out: [gau](https://github.com/lc/gau) ◇ [waybackurls](https://github.com/tomnomnom/waybackurls) ◇ [waymore](https://github.com/xnl-h4ck3r/waymore) \ No newline at end of file diff --git a/cmd/xurlfind3r/main.go b/cmd/xurlfind3r/main.go index 478d353..07e0ed0 100644 --- a/cmd/xurlfind3r/main.go +++ b/cmd/xurlfind3r/main.go @@ -3,6 +3,7 @@ package main import ( "bufio" "fmt" + "log" "os" "path/filepath" "reflect" @@ -13,10 +14,11 @@ import ( "github.com/hueristiq/hqgolog/formatter" "github.com/hueristiq/hqgolog/levels" "github.com/hueristiq/xurlfind3r/internal/configuration" - "github.com/hueristiq/xurlfind3r/pkg/scraper" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/logrusorgru/aurora/v3" "github.com/spf13/pflag" + "github.com/spf13/viper" ) var ( @@ -29,8 +31,6 @@ var ( listSources bool sourcesToUse []string sourcesToExclude []string - parseWaybackRobots bool - parseWaybackSource bool filterPattern string matchPattern string monochrome bool @@ -49,13 +49,11 @@ func init() { pflag.BoolVar(&listSources, "sources", false, "") pflag.StringSliceVarP(&sourcesToUse, "use-sources", "u", []string{}, "") pflag.StringSliceVarP(&sourcesToExclude, "exclude-sources", "e", []string{}, "") - pflag.BoolVar(&parseWaybackRobots, "parse-wayback-robots", false, "") - pflag.BoolVar(&parseWaybackSource, "parse-wayback-source", false, "") pflag.StringVarP(&filterPattern, "filter", "f", "", "") pflag.StringVarP(&matchPattern, "match", "m", "", "") pflag.BoolVar(&monochrome, "no-color", false, "") pflag.StringVarP(&output, "output", "o", "", "") - pflag.StringVarP(&outputDirectory, "outputDirectory", "O", "", "") + pflag.StringVarP(&outputDirectory, "output-directory", "O", "", "") pflag.BoolVarP(&silent, "silent", "s", false, "") pflag.BoolVarP(&verbose, "verbose", "v", false, "") @@ -67,15 +65,15 @@ func init() { h += fmt.Sprintf(" %s [OPTIONS]\n", configuration.NAME) h += "\nCONFIGURATION:\n" - defaultConfigurationFilePath := strings.ReplaceAll(configuration.ConfigurationFilePath, configuration.UserDotConfigDirectoryPath, "$HOME/.config") + defaultConfigurationFilePath := strings.ReplaceAll(configuration.ConfigurationFilePath, configuration.UserDotConfigDirectoryPath, "$HOME/.cfg") h += fmt.Sprintf(" -c, --configuration string configuration file (default: %s)\n", defaultConfigurationFilePath) h += "\nINPUT:\n" h += " -d, --domain string[] target domain\n" h += " -l, --list string target domains' list file path\n" - h += "\n TIP: For multiple input domains use comma(,) separated value with `-d`,\n" - h += " specify multiple `-d`, load from file with `-l` or load from stdin.\n" + h += "\nTIP: For multiple input domains use comma(,) separated value with `-d`,\n" + h += " specify multiple `-d`, load from 
file with `-l` or load from stdin.\n" h += "\nSCOPE:\n" h += " --include-subdomains bool match subdomain's URLs\n" @@ -84,8 +82,6 @@ func init() { h += " --sources bool list supported sources\n" h += " -u, --use-sources string[] comma(,) separated sources to use\n" h += " -e, --exclude-sources string[] comma(,) separated sources to exclude\n" - h += " --parse-wayback-robots bool with wayback, parse robots.txt snapshots\n" - h += " --parse-wayback-source bool with wayback, parse source code snapshots\n" h += "\nFILTER & MATCH:\n" h += " -f, --filter string regex to filter URLs\n" @@ -103,6 +99,20 @@ func init() { pflag.Parse() + // Initialize configuration management (...with viper) + if err := configuration.CreateUpdate(configurationFilePath); err != nil { + hqgolog.Fatal().Msg(err.Error()) + } + + viper.SetConfigFile(configurationFilePath) + viper.AutomaticEnv() + viper.SetEnvPrefix("XSUBFIND3R") + viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_")) + + if err := viper.ReadInConfig(); err != nil { + log.Fatalln(err) + } + // Initialize logger (hqgolog) hqgolog.DefaultLogger.SetMaxLevel(levels.LevelInfo) @@ -114,11 +124,6 @@ func init() { Colorize: !monochrome, })) - // Create or Update configuration - if err := configuration.CreateUpdate(configurationFilePath); err != nil { - hqgolog.Fatal().Msg(err.Error()) - } - au = aurora.NewAurora(!monochrome) } @@ -130,33 +135,31 @@ func main() { var err error - var config configuration.Configuration + var cfg *configuration.Configuration - // read in configuration. - config, err = configuration.Read(configurationFilePath) - if err != nil { + if err = viper.Unmarshal(&cfg); err != nil { hqgolog.Fatal().Msg(err.Error()) } // if --sources: List suported sources & exit. if listSources { hqgolog.Print().Msg("") - hqgolog.Info().Msgf("listing, %v, current supported sources.", au.Underline(strconv.Itoa(len(config.Sources))).Bold()) + hqgolog.Info().Msgf("listing, %v, current supported sources.", au.Underline(strconv.Itoa(len(cfg.Sources))).Bold()) hqgolog.Info().Msgf("sources marked with %v take in key(s) or token(s).", au.Underline("*").Bold()) hqgolog.Print().Msg("") needsKey := make(map[string]interface{}) - keysElem := reflect.ValueOf(&config.Keys).Elem() + keysElem := reflect.ValueOf(&cfg.Keys).Elem() - for i := 0; i < keysElem.NumField(); i++ { + for i := range keysElem.NumField() { needsKey[strings.ToLower(keysElem.Type().Field(i).Name)] = keysElem.Field(i).Interface() } - for i, source := range config.Sources { + for _, source := range cfg.Sources { if _, ok := needsKey[source]; ok { - hqgolog.Print().Msgf("%d. %s *", i+1, source) + hqgolog.Print().Msgf("> %s *", source) } else { - hqgolog.Print().Msgf("%d. 
%s", i+1, source) + hqgolog.Print().Msgf("> %s", source) } } @@ -171,9 +174,7 @@ func main() { file, err = os.Open(domainsListFilePath) if err != nil { - hqgolog.Error().Msg(err.Error()) - - return + hqgolog.Fatal().Msg(err.Error()) } scanner := bufio.NewScanner(file) @@ -187,9 +188,7 @@ func main() { } if err = scanner.Err(); err != nil { - hqgolog.Error().Msg(err.Error()) - - return + hqgolog.Fatal().Msg(err.Error()) } } @@ -206,12 +205,24 @@ func main() { } if err = scanner.Err(); err != nil { - hqgolog.Error().Msg(err.Error()) - - return + hqgolog.Fatal().Msg(err.Error()) } } + var finder *xurlfind3r.Finder + + finder, err = xurlfind3r.New(&xurlfind3r.Configuration{ + IncludeSubdomains: includeSubdomains, + SourcesToUse: sourcesToUse, + SourcesToExclude: sourcesToExclude, + Keys: cfg.Keys, + FilterPattern: filterPattern, + MatchPattern: matchPattern, + }) + if err != nil { + hqgolog.Fatal().Msg(err.Error()) + } + // scrape and output URLs. var consolidatedWriter *bufio.Writer @@ -236,55 +247,31 @@ func main() { mkdir(outputDirectory) } - options := &scraper.Options{ - IncludeSubdomains: includeSubdomains, - SourcesToUSe: sourcesToUse, - SourcesToExclude: sourcesToExclude, - Keys: config.Keys, - ParseWaybackRobots: parseWaybackRobots, - ParseWaybackSource: parseWaybackSource, - FilterPattern: filterPattern, - Matchattern: matchPattern, - } - - var spr *scraper.Finder - - spr, err = scraper.New(options) - if err != nil { - hqgolog.Error().Msg(err.Error()) - - return - } - - for index := range domains { - domain := domains[index] - + for _, domain := range domains { if !silent { hqgolog.Print().Msg("") hqgolog.Info().Msgf("Finding URLs for %v...", au.Underline(domain).Bold()) hqgolog.Print().Msg("") } - URLs := spr.Scrape(domain) + results := finder.Find(domain) switch { case output != "": - outputURLs(consolidatedWriter, URLs) + outputURLs(consolidatedWriter, results) case outputDirectory != "": var domainFile *os.File domainFile, err = os.OpenFile(filepath.Join(outputDirectory, domain+".txt"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644) if err != nil { - hqgolog.Error().Msg(err.Error()) - - return + hqgolog.Fatal().Msg(err.Error()) } domainWriter := bufio.NewWriter(domainFile) - outputURLs(domainWriter, URLs) + outputURLs(domainWriter, results) default: - outputURLs(nil, URLs) + outputURLs(nil, results) } } } @@ -312,11 +299,11 @@ func mkdir(path string) { func outputURLs(writer *bufio.Writer, URLs chan sources.Result) { for URL := range URLs { switch URL.Type { - case sources.Error: + case sources.ResultError: if verbose { hqgolog.Error().Msgf("%s: %s\n", URL.Source, URL.Error) } - case sources.URL: + case sources.ResultURL: if verbose { hqgolog.Print().Msgf("[%s] %s", au.BrightBlue(URL.Source), URL.Value) } else { diff --git a/go.mod b/go.mod index 11c5778..ac158ff 100644 --- a/go.mod +++ b/go.mod @@ -1,25 +1,43 @@ module github.com/hueristiq/xurlfind3r -go 1.20 +go 1.23.1 require ( - dario.cat/mergo v1.0.0 - github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead - github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8 + dario.cat/mergo v1.0.1 + github.com/hueristiq/hq-go-http v0.0.0-20241014121239-62e79a5b0581 + github.com/hueristiq/hq-go-limiter v0.0.0-20241014121435-ad5ad6a707cd + github.com/hueristiq/hq-go-url v0.0.0-20241014121328-cbb5439a9021 github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f - github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 github.com/logrusorgru/aurora/v3 v3.0.0 - github.com/spf13/cast v1.6.0 + 
github.com/spf13/cast v1.7.0 github.com/spf13/pflag v1.0.5 + github.com/spf13/viper v1.19.0 + github.com/stretchr/testify v1.9.0 github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 gopkg.in/yaml.v3 v3.0.1 ) require ( github.com/Mzack9999/go-http-digest-auth-client v0.6.0 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/fsnotify/fsnotify v1.7.0 // indirect + github.com/hashicorp/hcl v1.0.0 // indirect + github.com/hueristiq/hq-go-retrier v0.0.0-20241014121125-09fff4c010e8 // indirect github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 // indirect - golang.org/x/net v0.18.0 // indirect - golang.org/x/sys v0.14.0 // indirect - golang.org/x/term v0.14.0 // indirect - golang.org/x/text v0.14.0 // indirect + github.com/magiconair/properties v1.8.7 // indirect + github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/pelletier/go-toml/v2 v2.2.3 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect + github.com/sagikazarmark/locafero v0.6.0 // indirect + github.com/sagikazarmark/slog-shim v0.1.0 // indirect + github.com/sourcegraph/conc v0.3.0 // indirect + github.com/spf13/afero v1.11.0 // indirect + github.com/subosito/gotenv v1.6.0 // indirect + go.uber.org/multierr v1.11.0 // indirect + golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c // indirect + golang.org/x/net v0.30.0 // indirect + golang.org/x/sys v0.26.0 // indirect + golang.org/x/term v0.25.0 // indirect + golang.org/x/text v0.19.0 // indirect + gopkg.in/ini.v1 v1.67.0 // indirect ) diff --git a/go.sum b/go.sum index d440eb1..9671c89 100644 --- a/go.sum +++ b/go.sum @@ -1,42 +1,84 @@ -dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= -dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= +dario.cat/mergo v1.0.1 h1:Ra4+bf83h2ztPIQYNP99R6m+Y7KfnARDfID+a+vLl4s= +dario.cat/mergo v1.0.1/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= github.com/Mzack9999/go-http-digest-auth-client v0.6.0 h1:LXVNMsj7qiNVmlZByFbjJmXf6SOm/uoo04XmnNcWPms= github.com/Mzack9999/go-http-digest-auth-client v0.6.0/go.mod h1:gbwaYYXwA15ZfIxMyY5QU1acATDyNKEuG5TylBCL7AM= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= -github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= -github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead h1:Iep2G2h3hSwc7w0qr1iVVAptgXqjn7DRXVQ33luPmhk= -github.com/hueristiq/hqgohttp v0.0.0-20231024010818-fdb48fa4aead/go.mod h1:Faf/mOhyfNnLIfhoYj2vfPrjt0nKBr4WaU+OQ0C7r6U= -github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8 h1:r4ze6pX8H//X4SJEIcn8wHPgAhaGKEaa44lyHh1epXY= -github.com/hueristiq/hqgolimit v0.0.0-20230623113203-3e14552a97f8/go.mod h1:CzhJzxz2rv/NMKNz5b4eKFh1epdcED05YTHT32NFyrI= +github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= +github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= +github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= 
+github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= +github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= +github.com/hueristiq/hq-go-http v0.0.0-20241014121239-62e79a5b0581 h1:5NmFYwuPCurGVjz0I/g7q998rp/X4vDPtLgr3pvrlQ0= +github.com/hueristiq/hq-go-http v0.0.0-20241014121239-62e79a5b0581/go.mod h1:HNq1LSRIFndL7YzB17UVatJxsbWQFZkaaJvJv5AYWX0= +github.com/hueristiq/hq-go-limiter v0.0.0-20241014121435-ad5ad6a707cd h1:PpHm0WsXI4CnGI86sJsDtLHdfTJVxOJ7WTlL1ofZUWY= +github.com/hueristiq/hq-go-limiter v0.0.0-20241014121435-ad5ad6a707cd/go.mod h1:n1ODyTZYMyUuOIQCqHZgvobG8zWY/gxQaGeMiSnUnZw= +github.com/hueristiq/hq-go-retrier v0.0.0-20241014121125-09fff4c010e8 h1:QOJMjzKftmhYZojjqe2RDbUQDH/TZSz8gZURgxbGypM= +github.com/hueristiq/hq-go-retrier v0.0.0-20241014121125-09fff4c010e8/go.mod h1:YkxIHoJHsL0wmzQ3tc0qz4UTr9q9eCicUt5RvMV//xw= +github.com/hueristiq/hq-go-url v0.0.0-20241014121328-cbb5439a9021 h1:mBTiyALF+hGfDd6/wi8ihy1P++vHzO0kqXfGch0JK28= +github.com/hueristiq/hq-go-url v0.0.0-20241014121328-cbb5439a9021/go.mod h1:oDQP/s9eYGtsTz7LrbCC9CKbq6TmfB9b7yAoPCIqqyQ= github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f h1:JAgZOIJ+UbkENpRiOTlfg51CW0UNrUkgwLjUGiH+x9g= github.com/hueristiq/hqgolog v0.0.0-20230623113334-a6018965a34f/go.mod h1:S5J3E3Azva5+JKv67uc+Hh3XwLDvkVYDGjEaMTFrIqg= -github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53 h1:6pwdpEJoB1woSToh0cxLh5QirNOAp2z7DzvMKiaqdro= -github.com/hueristiq/hqgourl v0.0.0-20230821112831-e12f907b5a53/go.mod h1:Fc2vfWpIVFWUmCv1S0xVsz3mIPYwdgsa6f2vCgL4CrA= github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440 h1:dpHAa9c74HgAXkZ2WPd84q2cCiF76eluuSGRw7bk7To= github.com/hueristiq/hqgoutils v0.0.0-20231024005153-bd2c47932440/go.mod h1:NlZ117o///yWDbRAbgYD7/Y44qce8z1Dj4caUsjunSY= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/logrusorgru/aurora/v3 v3.0.0 h1:R6zcoZZbvVcGMvDCKo45A9U/lzYyzl5NfYIvznmDfE4= github.com/logrusorgru/aurora/v3 v3.0.0/go.mod h1:vsR12bk5grlLvLXAYrBsb5Oc/N+LxAlxggSjiwMnCUc= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= +github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= +github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= +github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= +github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= +github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= -github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= 
-github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= +github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3N51bwOk= +github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0= +github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= +github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= +github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= +github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= +github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/cast v1.7.0 h1:ntdiHjuueXFgm5nzDRdOS4yfT43P5Fnud6DH50rz/7w= +github.com/spf13/cast v1.7.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= +github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= +github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80 h1:nrZ3ySNYwJbSpD6ce9duiP+QkD3JuLCcWkdaehUS/3Y= github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80/go.mod h1:iFyPdL66DjUD96XmzVL3ZntbzcflLnznH0fr99w5VqE= -golang.org/x/net v0.18.0 h1:mIYleuAkSbHh0tCv7RvjL3F6ZVbLjq4+R7zbOn3Kokg= -golang.org/x/net v0.18.0/go.mod h1:/czyP5RqHAH4odGYxBJ1qz0+CE5WZ+2j1YgoEo8F2jQ= -golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q= -golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/term v0.14.0 h1:LGK9IlZ8T9jvdy6cTdfKUCltatMFOehAQo9SRC46UQ8= -golang.org/x/term v0.14.0/go.mod h1:TySc+nGkYR6qt8km8wUhuFRTVSMIX3XPR58y2lC8vww= -golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= -golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= +go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= +golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c h1:7dEasQXItcW1xKJ2+gg5VOiBnqWrJc+rq0DPKyvvdbY= +golang.org/x/exp v0.0.0-20241009180824-f66d83c29e7c/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= +golang.org/x/net v0.30.0 h1:AcW1SDZMkb8IpzCdQUaIq2sP4sZ4zw+55h6ynffypl4= +golang.org/x/net v0.30.0/go.mod h1:2wGyMJ5iFasEhkwi13ChkO/t1ECNC4X4eBKkVFyYFlU= +golang.org/x/sys v0.26.0 
h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.25.0 h1:WtHI/ltw4NvSUig5KARz9h521QvRC8RmF/cuYqifU24= +golang.org/x/term v0.25.0/go.mod h1:RPyXicDX+6vLxogjjRxjgD2TKtmAO6NZBsBRfrOLu7M= +golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM= +golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= +gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/internal/configuration/configuration.go b/internal/configuration/configuration.go index 5b6e4bc..39f22bb 100644 --- a/internal/configuration/configuration.go +++ b/internal/configuration/configuration.go @@ -6,7 +6,7 @@ import ( "dario.cat/mergo" "github.com/hueristiq/hqgolog" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/logrusorgru/aurora/v3" "gopkg.in/yaml.v3" ) @@ -46,8 +46,8 @@ func (configuration *Configuration) Write(path string) (err error) { } const ( - NAME string = "xurlfind3r" - VERSION string = "0.4.0" + NAME = "xurlfind3r" + VERSION = "0.4.0" ) var ( @@ -58,11 +58,8 @@ __ ___ _ _ __| |/ _(_)_ __ __| |___ / _ __ \ \/ / | | | '__| | |_| | '_ \ / _`+"`"+` | |_ \| '__| > <| |_| | | | | _| | | | | (_| |___) | | /_/\_\\__,_|_| |_|_| |_|_| |_|\__,_|____/|_| - %s - - %s`).Bold(), + %s`).Bold(), aurora.BrightRed("v"+VERSION).Bold(), - aurora.BrightYellow("with <3 by Hueristiq Open Source").Italic(), ) UserDotConfigDirectoryPath = func() (userDotConfig string) { var err error @@ -88,8 +85,8 @@ func CreateUpdate(path string) (err error) { Sources: sources.List, Keys: sources.Keys{ Bevigil: []string{}, - GitHub: []string{}, - Intelx: []string{}, + Github: []string{}, + IntelX: []string{}, URLScan: []string{}, }, } diff --git a/pkg/httpclient/client.go b/pkg/httpclient/client.go index 5df268c..e60b26d 100644 --- a/pkg/httpclient/client.go +++ b/pkg/httpclient/client.go @@ -5,19 +5,21 @@ import ( "io" "net/http" "net/url" + "time" - "github.com/hueristiq/hqgohttp" - "github.com/hueristiq/hqgohttp/methods" - "github.com/hueristiq/hqgohttp/status" + hqgohttp "github.com/hueristiq/hq-go-http" + "github.com/hueristiq/hq-go-http/methods" + "github.com/hueristiq/hq-go-http/status" "github.com/hueristiq/xurlfind3r/internal/configuration" ) var client *hqgohttp.Client func init() { - options := hqgohttp.DefaultOptionsSpraying + cfg := hqgohttp.DefaultSprayingClientConfiguration + cfg.Timeout = 1 * time.Hour - client, _ = hqgohttp.New(options) + client, _ = hqgohttp.NewClient(cfg) } func httpRequestWrapper(req *hqgohttp.Request) (res *http.Response, err error) { @@ -47,6 +49,7 @@ func HTTPRequest(method, requestURL, cookies string, headers map[string]string, req.Header.Set("Accept", "*/*") req.Header.Set("Accept-Language", "en") req.Header.Set("User-Agent", fmt.Sprintf("%s v%s (https://github.com/hueristiq/%s)", configuration.NAME, configuration.VERSION, 
configuration.NAME)) + // req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:130.0) Gecko/20100101 Firefox/130.0") if cookies != "" { req.Header.Set("Cookie", cookies) diff --git a/pkg/scraper/scraper.go b/pkg/scraper/scraper.go deleted file mode 100644 index ffb04f2..0000000 --- a/pkg/scraper/scraper.go +++ /dev/null @@ -1,135 +0,0 @@ -package scraper - -import ( - "regexp" - "sync" - - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/bevigil" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/commoncrawl" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/github" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/intelx" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/otx" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/urlscan" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources/wayback" -) - -type Options struct { - IncludeSubdomains bool - SourcesToUSe []string - SourcesToExclude []string - Keys sources.Keys - ParseWaybackRobots bool - ParseWaybackSource bool - FilterPattern string - Matchattern string -} - -type Finder struct { - Sources map[string]sources.Source - SourcesConfiguration *sources.Configuration - FilterRegex *regexp.Regexp - MatchRegex *regexp.Regexp -} - -func (finder *Finder) Scrape(domain string) (results chan sources.Result) { - results = make(chan sources.Result) - - go func() { - defer close(results) - - seenURLs := &sync.Map{} - - wg := &sync.WaitGroup{} - - for name := range finder.Sources { - wg.Add(1) - - go func(source sources.Source) { - defer wg.Done() - - sResults := source.Run(finder.SourcesConfiguration, domain) - - for sResult := range sResults { - if sResult.Type == sources.URL { - _, loaded := seenURLs.LoadOrStore(sResult.Value, struct{}{}) - if loaded { - continue - } - - if (finder.MatchRegex != nil && !finder.MatchRegex.MatchString(sResult.Value)) || (finder.FilterRegex != nil && finder.MatchRegex == nil && finder.FilterRegex.MatchString(sResult.Value)) { - continue - } - } - - results <- sResult - } - }(finder.Sources[name]) - } - - wg.Wait() - }() - - return -} - -func New(options *Options) (finder *Finder, err error) { - finder = &Finder{ - Sources: map[string]sources.Source{}, - SourcesConfiguration: &sources.Configuration{ - IncludeSubdomains: options.IncludeSubdomains, - Keys: options.Keys, - ParseWaybackRobots: options.ParseWaybackRobots, - ParseWaybackSource: options.ParseWaybackSource, - }, - } - - if options.FilterPattern != "" { - finder.FilterRegex, err = regexp.Compile(options.FilterPattern) - if err != nil { - return - } - } - - if options.Matchattern != "" { - finder.MatchRegex, err = regexp.Compile(options.Matchattern) - if err != nil { - return - } - } - - // Sources To Use - if len(options.SourcesToUSe) < 1 { - options.SourcesToUSe = sources.List - } - - for index := range options.SourcesToUSe { - source := options.SourcesToUSe[index] - - switch source { - case "bevigil": - finder.Sources[source] = &bevigil.Source{} - case "commoncrawl": - finder.Sources[source] = &commoncrawl.Source{} - case "github": - finder.Sources[source] = &github.Source{} - case "intelx": - finder.Sources[source] = &intelx.Source{} - case "otx": - finder.Sources[source] = &otx.Source{} - case "urlscan": - finder.Sources[source] = &urlscan.Source{} - case "wayback": - finder.Sources[source] = &wayback.Source{} - } - } - - // Sources To Exclude - for index := range options.SourcesToExclude { - source := options.SourcesToExclude[index] - - 
delete(finder.Sources, source) - } - - return -} diff --git a/pkg/scraper/sources/sources.go b/pkg/scraper/sources/sources.go deleted file mode 100644 index b91cdec..0000000 --- a/pkg/scraper/sources/sources.go +++ /dev/null @@ -1,50 +0,0 @@ -package sources - -type Source interface { - // Run takes in configuration which includes keys/tokens and other stuff, - // and domain as arguments. - Run(config *Configuration, domain string) <-chan Result - // Name returns the name of the source. - Name() string -} - -type Configuration struct { - IncludeSubdomains bool - Keys Keys - ParseWaybackRobots bool - ParseWaybackSource bool -} - -type Keys struct { - Bevigil []string `yaml:"bevigil"` - GitHub []string `yaml:"github"` - Intelx []string `yaml:"intelx"` - URLScan []string `yaml:"urlscan"` -} - -// Result is a result structure returned by a source. -type Result struct { - Type ResultType - Source string - Value string - Error error -} - -// ResultType is the type of result returned by the source. -type ResultType int - -// Types of results returned by the source. -const ( - URL ResultType = iota - Error -) - -var List = []string{ - "bevigil", - "commoncrawl", - "github", - "intelx", - "otx", - "urlscan", - "wayback", -} diff --git a/pkg/scraper/sources/wayback/wayback.go b/pkg/scraper/sources/wayback/wayback.go deleted file mode 100644 index 998b7b9..0000000 --- a/pkg/scraper/sources/wayback/wayback.go +++ /dev/null @@ -1,243 +0,0 @@ -package wayback - -import ( - "encoding/json" - "fmt" - "net/http" - "regexp" - "strings" - - "github.com/hueristiq/hqgohttp/headers" - "github.com/hueristiq/hqgolimit" - "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" - "github.com/spf13/cast" -) - -type Source struct{} - -var limiter = hqgolimit.New(&hqgolimit.Options{ - RequestsPerMinute: 40, -}) - -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { - results := make(chan sources.Result) - - go func() { - defer close(results) - - var err error - - getPagesReqURL := formatURL(domain, config.IncludeSubdomains) + "&showNumPages=true" - - limiter.Wait() - - var getPagesRes *http.Response - - getPagesRes, err = httpclient.SimpleGet(getPagesReqURL) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: source.Name(), - Error: err, - } - - results <- result - - httpclient.DiscardResponse(getPagesRes) - - return - } - - var pages uint - - if err = json.NewDecoder(getPagesRes.Body).Decode(&pages); err != nil { - result := sources.Result{ - Type: sources.Error, - Source: source.Name(), - Error: err, - } - - results <- result - - getPagesRes.Body.Close() - - return - } - - getPagesRes.Body.Close() - - waybackURLs := [][]string{} - - for page := uint(0); page < pages; page++ { - getURLsReqURL := fmt.Sprintf("%s&page=%d", formatURL(domain, config.IncludeSubdomains), page) - - limiter.Wait() - - var getURLsRes *http.Response - - getURLsRes, err = httpclient.SimpleGet(getURLsReqURL) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: source.Name(), - Error: err, - } - - results <- result - - httpclient.DiscardResponse(getURLsRes) - - return - } - - var getURLsResData [][]string - - if err = json.NewDecoder(getURLsRes.Body).Decode(&getURLsResData); err != nil { - result := sources.Result{ - Type: sources.Error, - Source: source.Name(), - Error: err, - } - - results <- result - - getURLsRes.Body.Close() - - return - } - - getURLsRes.Body.Close() - - // check if there's results, 
wayback's pagination response - // is not always correct when using a filter - if len(getURLsResData) == 0 { - break - } - - // output results - // Slicing as [1:] to skip first result by default - waybackURLs = append(waybackURLs, getURLsResData[1:]...) - } - - mediaURLRegex := regexp.MustCompile(`(?i)\.(apng|bpm|png|bmp|gif|heif|ico|cur|jpg|jpeg|jfif|pjp|pjpeg|psd|raw|svg|tif|tiff|webp|xbm|3gp|aac|flac|mpg|mpeg|mp3|mp4|m4a|m4v|m4p|oga|ogg|ogv|mov|wav|webm|eot|woff|woff2|ttf|otf|pdf)(?:\?|#|$)`) - robotsURLsRegex := regexp.MustCompile(`^(https?)://[^ "]+/robots.txt$`) - - for _, waybackURL := range waybackURLs { - URL := waybackURL[1] - - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - return - } - - result := sources.Result{ - Type: sources.URL, - Source: source.Name(), - Value: URL, - } - - results <- result - - if mediaURLRegex.MatchString(URL) { - return - } - - if config.ParseWaybackRobots && robotsURLsRegex.MatchString(URL) { - parseWaybackRobots(config, URL, results) - - return - } - - if config.ParseWaybackSource { - parseWaybackSource(config, domain, URL, results) - } - } - }() - - return results -} - -func formatURL(domain string, includeSubdomains bool) (URL string) { - if includeSubdomains { - domain = "*." + domain - } - - URL = fmt.Sprintf("http://web.archive.org/cdx/search/cdx?url=%s/*&output=json&collapse=urlkey&fl=timestamp,original,mimetype,statuscode,digest", domain) - - return -} - -func getSnapshots(URL string) (snapshots [][2]string, err error) { - getSnapshotsReqURL := fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s&output=json&fl=timestamp,original&collapse=digest", URL) - - var getSnapshotsRes *http.Response - - limiter.Wait() - - getSnapshotsRes, err = httpclient.SimpleGet(getSnapshotsReqURL) - if err != nil { - return - } - - if cast.ToInt(getSnapshotsRes.Header.Get(headers.ContentLength)) == 0 { - return - } - - if err = json.NewDecoder(getSnapshotsRes.Body).Decode(&snapshots); err != nil { - getSnapshotsRes.Body.Close() - - return - } - - getSnapshotsRes.Body.Close() - - if len(snapshots) < 2 { - return - } - - snapshots = snapshots[1:] - - return -} - -func getSnapshotContent(snapshot [2]string) (content string, err error) { - var ( - timestamp = snapshot[0] - URL = snapshot[1] - ) - - getSnapshotContentReqURL := fmt.Sprintf("https://web.archive.org/web/%sif_/%s", timestamp, URL) - - limiter.Wait() - - var getSnapshotContentRes *http.Response - - getSnapshotContentRes, err = httpclient.SimpleGet(getSnapshotContentReqURL) - if err != nil { - return - } - - content = cast.ToString(getSnapshotContentRes.Body) - - if content == "" { - getSnapshotContentRes.Body.Close() - - return - } - - getSnapshotContentRes.Body.Close() - - snapshotNotFoundFingerprint := "This page can't be displayed. 
Please use the correct URL address to access" - - if strings.Contains(content, snapshotNotFoundFingerprint) { - err = fmt.Errorf("%s", snapshotNotFoundFingerprint) - - return - } - - return -} - -func (source *Source) Name() string { - return "wayback" -} diff --git a/pkg/scraper/sources/wayback/waybackrobots.go b/pkg/scraper/sources/wayback/waybackrobots.go deleted file mode 100644 index d615b8b..0000000 --- a/pkg/scraper/sources/wayback/waybackrobots.go +++ /dev/null @@ -1,112 +0,0 @@ -package wayback - -import ( - "path/filepath" - "regexp" - "strings" - "sync" - - "github.com/hueristiq/hqgourl" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" -) - -func parseWaybackRobots(_ *sources.Configuration, URL string, results chan sources.Result) { - robotsEntryRegex := regexp.MustCompile(`(Allow|Disallow):\s?.+`) - - snapshots, err := getSnapshots(URL) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:robots", - Error: err, - } - - results <- result - - return - } - - wg := &sync.WaitGroup{} - - for _, row := range snapshots { - wg.Add(1) - - go func(row [2]string) { - defer wg.Done() - - content, err := getSnapshotContent(row) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:robots", - Error: err, - } - - results <- result - - return - } - - matches := robotsEntryRegex.FindAllStringSubmatch(content, -1) - - if len(matches) < 1 { - return - } - - for _, match := range matches { - entry := match[0] - - temp := strings.Split(entry, ": ") - - if len(temp) <= 1 { - continue - } - - robotsURL := temp[1] - - if robotsURL == "/" || robotsURL == "*" || robotsURL == "" { - continue - } - - robotsURL = strings.ReplaceAll(robotsURL, "*", "") - - for strings.HasPrefix(robotsURL, "/") { - if len(robotsURL) >= 1 { - robotsURL = robotsURL[1:] // Ex. 
/*/test or /*/*/demo - } - } - - for strings.HasSuffix(robotsURL, "/") { - if len(robotsURL) >= 1 { - robotsURL = robotsURL[0 : len(robotsURL)-1] - } - } - - parsedURL, err := hqgourl.Parse(URL) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:robots", - Error: err, - } - - results <- result - - continue - } - - robotsURL = parsedURL.Scheme + "://" + filepath.Join(parsedURL.Domain, robotsURL) - - result := sources.Result{ - Type: sources.URL, - Source: "wayback:robots", - Value: robotsURL, - } - - results <- result - } - }(row) - } - - wg.Wait() -} diff --git a/pkg/scraper/sources/wayback/waybacksource.go b/pkg/scraper/sources/wayback/waybacksource.go deleted file mode 100644 index 9757acb..0000000 --- a/pkg/scraper/sources/wayback/waybacksource.go +++ /dev/null @@ -1,187 +0,0 @@ -package wayback - -import ( - "fmt" - "mime" - "regexp" - "strings" - "sync" - - "github.com/hueristiq/hqgourl" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" -) - -func parseWaybackSource(config *sources.Configuration, domain, URL string, results chan sources.Result) { - var err error - - var snapshots [][2]string - - snapshots, err = getSnapshots(URL) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:source", - Error: err, - } - - results <- result - - return - } - - lxExtractor := hqgourl.Extractor.Relaxed() - - var mdExtractor *regexp.Regexp - - mdExtractor, err = hqgourl.Extractor.ModerateMatchHost(`(\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)?` + regexp.QuoteMeta(domain)) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:source", - Error: err, - } - - results <- result - - return - } - - regex1 := regexp.MustCompile(`^(//web\.archive\.org/web|https://web\.archive\.org/web|/web)/\d{14}([a-z]{2}_)?/.*`) - regex2 := regexp.MustCompile(`^https?://.*`) - - wg := &sync.WaitGroup{} - - for _, row := range snapshots { - wg.Add(1) - - go func(row [2]string) { - defer wg.Done() - - content, err := getSnapshotContent(row) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:source", - Error: err, - } - - results <- result - - return - } - - lxURLs := lxExtractor.FindAllString(content, -1) - - for _, lxURL := range lxURLs { - lxURL = sources.FixURL(lxURL) - - // `/web/20230128054726/https://example.com/` - // `//web.archive.org/web/20230128054726/https://example.com/` - // `https://web.archive.org/web/20230128054726/https://example.com/` - // `/web/20040111155853js_/http://example.com/2003/mm_menu.js` - if regex1.MatchString(lxURL) { - URLs := mdExtractor.FindAllString(lxURL, -1) - - for _, URL := range URLs { - // `https://web.archive.org/web/20001110042700/mailto:info@safaricom.co.ke`->safaricom.co.ke - if !strings.HasPrefix(URL, "http") { - continue - } - - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - continue - } - - result := sources.Result{ - Type: sources.URL, - Source: "wayback:source", - Value: URL, - } - - results <- result - } - - continue - } - - // `http://www.safaricom.co.ke/` - // `https://web.archive.org/web/*/http://www.safaricom.co.ke/*` - // `//html5shim.googlecode.com/svn/trunk/html5.js`` - if regex2.MatchString(lxURL) || strings.HasPrefix(lxURL, `//`) { - URLs := mdExtractor.FindAllString(lxURL, -1) - - for _, URL := range URLs { - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { - continue - } - - result := sources.Result{ - Type: sources.URL, - Source: "wayback:source", - Value: URL, - } - - results <- result - } - - 
continue - } - - // text/javascript - _, _, err := mime.ParseMediaType(lxURL) - if err == nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:source", - Error: err, - } - - results <- result - - continue - } - - // `//archive.org/includes/analytics.js?v=c535ca67`` - // `archive.org/components/npm/lit/polyfill-support.js?v=c535ca67` - // `archive.org/components/npm/@webcomponents/webcomponentsjs/webcomponents-bundle.js?v=c535ca67` - // `archive.org/includes/build/js/ia-topnav.min.js?v=c535ca67` - // `archive.org/includes/build/js/archive.min.js?v=c535ca67` - // `archive.org/includes/build/css/archive.min.css?v=c535ca67` - if strings.Contains(lxURL, "archive.org") { - continue - } - - parsedSourceURL, err := hqgourl.Parse(URL) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: "wayback:source", - Error: err, - } - - results <- result - - continue - } - - lxURL = strings.TrimLeft(lxURL, "/") - - lxURL = fmt.Sprintf("%s://%s/%s", parsedSourceURL.Scheme, parsedSourceURL.Domain, lxURL) - - if !sources.IsInScope(lxURL, domain, config.IncludeSubdomains) { - continue - } - - result := sources.Result{ - Type: sources.URL, - Source: "wayback:source", - Value: lxURL, - } - - results <- result - } - }(row) - } - - wg.Wait() -} diff --git a/pkg/scraper/sources/bevigil/bevigil.go b/pkg/xurlfind3r/sources/bevigil/bevigil.go similarity index 69% rename from pkg/scraper/sources/bevigil/bevigil.go rename to pkg/xurlfind3r/sources/bevigil/bevigil.go index fd8e75d..a3a787e 100644 --- a/pkg/scraper/sources/bevigil/bevigil.go +++ b/pkg/xurlfind3r/sources/bevigil/bevigil.go @@ -6,7 +6,7 @@ import ( "net/http" "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) type getURLsResponse struct { @@ -16,20 +16,16 @@ type getURLsResponse struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { results := make(chan sources.Result) go func() { defer close(results) - var err error - - var key string - - key, err = sources.PickRandom(config.Keys.Bevigil) - if key == "" || err != nil { + key, err := cfg.Keys.Bevigil.PickRandom() + if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -39,20 +35,17 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s return } - getURLsReqHeaders := map[string]string{} - - if len(config.Keys.Bevigil) > 0 { - getURLsReqHeaders["X-Access-Token"] = key - } + var getURLsRes *http.Response getURLsReqURL := fmt.Sprintf("https://osint.bevigil.com/api/%s/urls/", domain) - - var getURLsRes *http.Response + getURLsReqHeaders := map[string]string{ + "X-Access-Token": key, + } getURLsRes, err = httpclient.Get(getURLsReqURL, "", getURLsReqHeaders) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -68,7 +61,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(getURLsRes.Body).Decode(&getURLsResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -83,12 +76,12 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s 
getURLsRes.Body.Close() for _, URL := range getURLsResData.URLs { - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: URL, } @@ -101,5 +94,5 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } func (source *Source) Name() string { - return "bevigil" + return sources.BEVIGIL } diff --git a/pkg/scraper/sources/commoncrawl/commoncrawl.go b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go similarity index 81% rename from pkg/scraper/sources/commoncrawl/commoncrawl.go rename to pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go index 263d6c7..e219b64 100644 --- a/pkg/scraper/sources/commoncrawl/commoncrawl.go +++ b/pkg/xurlfind3r/sources/commoncrawl/commoncrawl.go @@ -10,7 +10,7 @@ import ( "time" "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) type getIndexesResponse []struct { @@ -31,10 +31,10 @@ type getURLsResponse struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { results := make(chan sources.Result) - if config.IncludeSubdomains { + if cfg.IncludeSubdomains { domain = "*." + domain } @@ -43,14 +43,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s getIndexesReqURL := "https://index.commoncrawl.org/collinfo.json" - var err error - - var getIndexesRes *http.Response - - getIndexesRes, err = httpclient.SimpleGet(getIndexesReqURL) + getIndexesRes, err := httpclient.SimpleGet(getIndexesReqURL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -66,7 +62,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(getIndexesRes.Body).Decode(&getIndexesResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -103,18 +99,17 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } for _, CCIndexAPI := range searchIndexes { + var getPaginationRes *http.Response + + getPaginationReqURL := fmt.Sprintf("%s?url=%s/*&output=json&fl=url&showNumPages=true", CCIndexAPI, domain) getURLsReqHeaders := map[string]string{ "Host": "index.commoncrawl.org", } - getPaginationReqURL := fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url&showNumPages=true", CCIndexAPI, domain) - - var getPaginationRes *http.Response - getPaginationRes, err = httpclient.SimpleGet(getPaginationReqURL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -130,7 +125,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(getPaginationRes.Body).Decode(&getPaginationData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -149,14 +144,14 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } for page := uint(0); page < getPaginationData.Pages; page++ { - getURLsReqURL := fmt.Sprintf("%s?url=*.%s/*&output=json&fl=url&page=%d", CCIndexAPI, domain, page) - var 
getURLsRes *http.Response + getURLsReqURL := fmt.Sprintf("%s?url=%s/*&output=json&fl=url&page=%d", CCIndexAPI, domain, page) + getURLsRes, err = httpclient.Get(getURLsReqURL, "", getURLsReqHeaders) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -180,7 +175,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.Unmarshal(scanner.Bytes(), &getURLsResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -192,7 +187,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if getURLsResData.Error != "" { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: fmt.Errorf("%s", getURLsResData.Error), } @@ -204,12 +199,12 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s URL := getURLsResData.URL - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: URL, } @@ -219,7 +214,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = scanner.Err(); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -240,5 +235,5 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } func (source *Source) Name() string { - return "commoncrawl" + return sources.COMMONCRAWL } diff --git a/pkg/xurlfind3r/sources/configuration.go b/pkg/xurlfind3r/sources/configuration.go new file mode 100644 index 0000000..e1bee4f --- /dev/null +++ b/pkg/xurlfind3r/sources/configuration.go @@ -0,0 +1,66 @@ +package sources + +import ( + "crypto/rand" + "errors" + "fmt" + "math/big" +) + +// Configuration holds the overall settings for different data sources. +// It includes API keys for each source and flags for various parsing options. +type Configuration struct { + IncludeSubdomains bool // Determines whether to include subdomains in the data collection process. + + Keys Keys // Holds API keys for multiple sources. + + IsInScope func(URL string) (isInScope bool) +} + +// Keys holds API keys for different data sources, with each source having a set of API keys. +type Keys struct { + Bevigil SourceKeys `yaml:"bevigil"` + Github SourceKeys `yaml:"github"` + IntelX SourceKeys `yaml:"intelx"` + URLScan SourceKeys `yaml:"urlscan"` +} + +// SourceKeys is a slice of strings representing API keys. Multiple API keys +// are used to allow for rotation or fallbacks when certain keys are unavailable. +type SourceKeys []string + +// PickRandom selects and returns a random API key from the SourceKeys slice. +// If the slice is empty, an error is returned. It uses a cryptographically secure +// random number generator to ensure randomness. +func (k SourceKeys) PickRandom() (key string, err error) { + length := len(k) + + // Return an error if no keys are available + if length == 0 { + err = ErrNoKeys + + return + } + + // Generate a cryptographically secure random number within the range [0, length). 
+ maximum := big.NewInt(int64(length)) + + var indexBig *big.Int + + indexBig, err = rand.Int(rand.Reader, maximum) + if err != nil { + err = fmt.Errorf("failed to generate random index: %w", err) + + return + } + + // Convert the big integer index to a standard int64. + index := indexBig.Int64() + + // Select the API key at the generated index. + key = k[index] + + return +} + +var ErrNoKeys = errors.New("no keys available for the source") diff --git a/pkg/scraper/sources/github/github.go b/pkg/xurlfind3r/sources/github/github.go similarity index 76% rename from pkg/scraper/sources/github/github.go rename to pkg/xurlfind3r/sources/github/github.go index 5d9e477..e72cb3c 100644 --- a/pkg/scraper/sources/github/github.go +++ b/pkg/xurlfind3r/sources/github/github.go @@ -10,11 +10,11 @@ import ( "strings" "time" - "github.com/hueristiq/hqgohttp/headers" - "github.com/hueristiq/hqgohttp/status" - "github.com/hueristiq/hqgourl" + "github.com/hueristiq/hq-go-http/headers" + "github.com/hueristiq/hq-go-http/status" + hqgourl "github.com/hueristiq/hq-go-url" "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/spf13/cast" "github.com/tomnomnom/linkheader" ) @@ -32,27 +32,27 @@ type searchResponse struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { results := make(chan sources.Result) go func() { defer close(results) - if len(config.Keys.GitHub) == 0 { + if len(cfg.Keys.Github) == 0 { return } - tokens := NewTokenManager(config.Keys.GitHub) + tokens := NewTokenManager(cfg.Keys.Github) searchReqURL := fmt.Sprintf("https://api.github.com/search/code?per_page=100&q=%q&sort=created&order=asc", domain) - source.Enumerate(searchReqURL, domain, tokens, results, config) + source.Enumerate(searchReqURL, domain, tokens, results, cfg) }() return results } -func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, results chan sources.Result, config *sources.Configuration) { +func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, results chan sources.Result, cfg *sources.Configuration) { token := tokens.Get() if token.RetryAfter > 0 { @@ -78,7 +78,7 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res if err != nil && !isForbidden { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -96,14 +96,14 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res tokens.setCurrentTokenExceeded(retryAfterSeconds) - source.Enumerate(searchReqURL, domain, tokens, results, config) + source.Enumerate(searchReqURL, domain, tokens, results, cfg) } var searchResData searchResponse if err = json.NewDecoder(searchRes.Body).Decode(&searchResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -117,20 +117,9 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res searchRes.Body.Close() - var mdExtractor *regexp.Regexp - - mdExtractor, err = hqgourl.Extractor.ModerateMatchHost(`(\w[a-zA-Z0-9][a-zA-Z0-9-\\.]*\.)?` + regexp.QuoteMeta(domain)) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: source.Name(), - Error: err, - } - - results <- result 
- - return - } + mdExtractor := hqgourl.NewExtractor( + hqgourl.ExtractorWithHostPattern(`(?:(?:\w+[.])*` + regexp.QuoteMeta(domain) + hqgourl.ExtractorPortOptionalPattern + `)`), + ).CompileRegex() for _, item := range searchResData.Items { getRawContentReqURL := getRawContentURL(item.HTMLURL) @@ -140,7 +129,7 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res getRawContentRes, err = httpclient.SimpleGet(getRawContentReqURL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -173,10 +162,10 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res var parsedURL *hqgourl.URL - parsedURL, err = hqgourl.Parse(URL) + parsedURL, err = up.Parse(URL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -188,12 +177,12 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res URL = parsedURL.String() - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: URL, } @@ -204,7 +193,7 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res if err = scanner.Err(); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -224,10 +213,10 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res for _, URL := range URLs { URL = sources.FixURL(URL) - parsedURL, err := hqgourl.Parse(URL) + parsedURL, err := up.Parse(URL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -239,12 +228,12 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res URL = parsedURL.String() - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: URL, } @@ -261,7 +250,7 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res nextURL, err := url.QueryUnescape(link.URL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -271,17 +260,19 @@ func (source *Source) Enumerate(searchReqURL, domain string, tokens *Tokens, res return } - source.Enumerate(nextURL, domain, tokens, results, config) + source.Enumerate(nextURL, domain, tokens, results, cfg) } } } +func (source *Source) Name() string { + return sources.GITHUB +} + +var up = hqgourl.NewParser(hqgourl.ParserWithDefaultScheme("http")) + func getRawContentURL(htmlURL string) string { domain := strings.ReplaceAll(htmlURL, "https://github.com/", "https://raw.githubusercontent.com/") return strings.ReplaceAll(domain, "/blob/", "/") } - -func (source *Source) Name() string { - return "github" -} diff --git a/pkg/scraper/sources/github/tokenmanager.go b/pkg/xurlfind3r/sources/github/tokenmanager.go similarity index 100% rename from pkg/scraper/sources/github/tokenmanager.go rename to pkg/xurlfind3r/sources/github/tokenmanager.go diff --git a/pkg/scraper/sources/intelx/intelx.go b/pkg/xurlfind3r/sources/intelx/intelx.go similarity index 84% rename from pkg/scraper/sources/intelx/intelx.go rename to 
pkg/xurlfind3r/sources/intelx/intelx.go index 9b7cb93..e7fd28f 100644 --- a/pkg/scraper/sources/intelx/intelx.go +++ b/pkg/xurlfind3r/sources/intelx/intelx.go @@ -8,9 +8,9 @@ import ( "strings" "time" - "github.com/hueristiq/hqgourl" + hqgourl "github.com/hueristiq/hq-go-url" "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) type searchRequest struct { @@ -34,20 +34,16 @@ type getResultsResponse struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { results := make(chan sources.Result) go func() { defer close(results) - var err error - - var key string - - key, err = sources.PickRandom(config.Keys.Intelx) - if key == "" || err != nil { + key, err := cfg.Keys.IntelX.PickRandom() + if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -57,6 +53,8 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s return } + var searchRes *http.Response + parts := strings.Split(key, ":") if len(parts) != 2 { return @@ -86,7 +84,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s searchReqBodyBytes, err = json.Marshal(searchReqBody) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -96,12 +94,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s return } - var searchRes *http.Response - searchRes, err = httpclient.Post(searchReqURL, "", searchReqHeaders, bytes.NewBuffer(searchReqBodyBytes)) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -117,7 +113,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(searchRes.Body).Decode(&searchResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -140,7 +136,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s getResultsRes, err = httpclient.Get(getResultsReqURL, "", nil) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -156,7 +152,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(getResultsRes.Body).Decode(&getResultsResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -176,10 +172,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s URL := hostname.Selectvalue URL = sources.FixURL(URL) - parsedURL, err := hqgourl.Parse(URL) + parsedURL, err := up.Parse(URL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -193,12 +189,12 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s URL = parsedURL.String() - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: 
URL, } @@ -212,5 +208,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } func (source *Source) Name() string { - return "intelx" + return sources.INTELLIGENCEX } + +var up = hqgourl.NewParser(hqgourl.ParserWithDefaultScheme("http")) diff --git a/pkg/scraper/sources/otx/otx.go b/pkg/xurlfind3r/sources/otx/otx.go similarity index 69% rename from pkg/scraper/sources/otx/otx.go rename to pkg/xurlfind3r/sources/otx/otx.go index 7d91869..d7496e1 100644 --- a/pkg/scraper/sources/otx/otx.go +++ b/pkg/xurlfind3r/sources/otx/otx.go @@ -3,11 +3,9 @@ package otx import ( "encoding/json" "fmt" - "net/http" - "github.com/hueristiq/hqgourl" "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" ) type getURLsResponse struct { @@ -34,36 +32,19 @@ type getURLsResponse struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { results := make(chan sources.Result) go func() { defer close(results) - parseURL, err := hqgourl.Parse(domain) - if err != nil { - result := sources.Result{ - Type: sources.Error, - Source: source.Name(), - Error: err, - } - - results <- result - - return - } - for page := 1; ; page++ { - getURLsReqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=100&page=%d", parseURL.ETLDPlusOne, page) - - var err error - - var getURLsRes *http.Response + getURLsReqURL := fmt.Sprintf("https://otx.alienvault.com/api/v1/indicators/domain/%s/url_list?limit=100&page=%d", domain, page) - getURLsRes, err = httpclient.SimpleGet(getURLsReqURL) + getURLsRes, err := httpclient.SimpleGet(getURLsReqURL) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -79,7 +60,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(getURLsRes.Body).Decode(&getURLsResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -96,12 +77,12 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s for _, item := range getURLsResData.URLList { URL := item.URL - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: URL, } @@ -119,5 +100,5 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } func (source *Source) Name() string { - return "otx" + return sources.OPENTHREATEXCHANGE } diff --git a/pkg/xurlfind3r/sources/result.go b/pkg/xurlfind3r/sources/result.go new file mode 100644 index 0000000..edfeff2 --- /dev/null +++ b/pkg/xurlfind3r/sources/result.go @@ -0,0 +1,19 @@ +package sources + +// Result represents the outcome of an operation or request, including the type of result, +// the source of the data, the actual value retrieved (if applicable), and any error encountered. +type Result struct { + Type ResultType // Specifies the type of result (e.g., a URL or an error). + Source string // Indicates the source from which the result was obtained (e.g., a specific API or service). 
+ Value string // Holds the value of the result, such as a URL or any other data returned from the operation. + Error error // Holds any error that occurred during the operation, or nil if no error occurred. +} + +// ResultType defines the type of result using an integer type. It can represent different +// kinds of outcomes from an operation, such as a URL or an error. +type ResultType int + +const ( + ResultURL ResultType = iota // Represents a successful result containing a URL. + ResultError // Represents a result where an error occurred during the operation. +) diff --git a/pkg/xurlfind3r/sources/sources.go b/pkg/xurlfind3r/sources/sources.go new file mode 100644 index 0000000..e9a2f0d --- /dev/null +++ b/pkg/xurlfind3r/sources/sources.go @@ -0,0 +1,39 @@ +package sources + +// Source is an interface that defines methods for a data source. +// Each source is expected to implement a way to run an operation based on configuration and a domain, +// and provide its name. +type Source interface { + // Run starts the data collection or scanning process for a specific domain. + // It takes in a Configuration and a domain string as input and returns a channel + // that emits Result structs. The channel is used for sending results back asynchronously. + Run(config *Configuration, domain string) <-chan Result + + // Name returns the name of the source. This can be used to identify the data source + // implementing the interface. + Name() string +} + +// Constants representing the names of different data sources. +// These sources could be APIs or services that are used to gather information about domains. +const ( + BEVIGIL = "bevigil" // Bevigil is an OSINT (Open-Source Intelligence) source. + COMMONCRAWL = "commoncrawl" // Common Crawl is a source of web data, commonly used in domain searches. + GITHUB = "github" // GitHub source for finding code repositories and related metadata. + INTELLIGENCEX = "intelx" // Intelligence X, a search engine and data archive. + OPENTHREATEXCHANGE = "otx" // Open Threat Exchange, a collaborative platform for sharing threat intelligence. + URLSCAN = "urlscan" // URLScan.io, a service for scanning websites and collecting URLs. + WAYBACK = "wayback" // Wayback Machine, an internet archive to retrieve historical versions of websites. +) + +// List contains a collection of all available source names. +// This is useful for iterating over or referencing the supported data sources. 
+var List = []string{ + BEVIGIL, + COMMONCRAWL, + GITHUB, + INTELLIGENCEX, + OPENTHREATEXCHANGE, + URLSCAN, + WAYBACK, +} diff --git a/pkg/scraper/sources/urlscan/urlscan.go b/pkg/xurlfind3r/sources/urlscan/urlscan.go similarity index 80% rename from pkg/scraper/sources/urlscan/urlscan.go rename to pkg/xurlfind3r/sources/urlscan/urlscan.go index 760702a..a57bf51 100644 --- a/pkg/scraper/sources/urlscan/urlscan.go +++ b/pkg/xurlfind3r/sources/urlscan/urlscan.go @@ -2,12 +2,12 @@ package urlscan import ( "encoding/json" + "errors" "fmt" - "net/http" "strings" "github.com/hueristiq/xurlfind3r/pkg/httpclient" - "github.com/hueristiq/xurlfind3r/pkg/scraper/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" "github.com/spf13/cast" ) @@ -29,20 +29,16 @@ type searchResponse struct { type Source struct{} -func (source *Source) Run(config *sources.Configuration, domain string) <-chan sources.Result { +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { results := make(chan sources.Result) go func() { defer close(results) - var err error - - var key string - - key, err = sources.PickRandom(config.Keys.URLScan) - if err != nil { + key, err := cfg.Keys.URLScan.PickRandom() + if err != nil && !errors.Is(err, sources.ErrNoKeys) { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -69,12 +65,10 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s searchReqURL += "&search_after=" + after } - var searchRes *http.Response - - searchRes, err = httpclient.Get(searchReqURL, "", searchReqHeaders) + searchRes, err := httpclient.Get(searchReqURL, "", searchReqHeaders) if err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -90,7 +84,7 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s if err = json.NewDecoder(searchRes.Body).Decode(&searchResData); err != nil { result := sources.Result{ - Type: sources.Error, + Type: sources.ResultError, Source: source.Name(), Error: err, } @@ -111,12 +105,12 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s for _, result := range searchResData.Results { URL := result.Page.URL - if !sources.IsInScope(URL, domain, config.IncludeSubdomains) { + if !cfg.IsInScope(URL) { continue } result := sources.Result{ - Type: sources.URL, + Type: sources.ResultURL, Source: source.Name(), Value: URL, } @@ -150,5 +144,5 @@ func (source *Source) Run(config *sources.Configuration, domain string) <-chan s } func (source *Source) Name() string { - return "urlscan" + return sources.URLSCAN } diff --git a/pkg/scraper/sources/utils.go b/pkg/xurlfind3r/sources/utils.go similarity index 70% rename from pkg/scraper/sources/utils.go rename to pkg/xurlfind3r/sources/utils.go index 16c88fe..eac1307 100644 --- a/pkg/scraper/sources/utils.go +++ b/pkg/xurlfind3r/sources/utils.go @@ -1,64 +1,10 @@ package sources import ( - "crypto/rand" - "fmt" - "math/big" "net/url" "strings" - - "github.com/hueristiq/hqgourl" ) -func PickRandom[T any](v []T) (picked T, err error) { - length := len(v) - - if length == 0 { - return - } - - max := big.NewInt(int64(length)) - - var indexBig *big.Int - - indexBig, err = rand.Int(rand.Reader, max) - if err != nil { - err = fmt.Errorf("failed to generate random index: %w", err) - - return - } - - index := indexBig.Int64() - - picked = v[index] - - return -} - -func IsInScope(URL, domain 
string, includeSubdomains bool) (isInScope bool) { - parsedURL, err := hqgourl.Parse(URL) - if err != nil { - return - } - - parsedDomain, err := hqgourl.Parse(domain) - if err != nil { - return - } - - if parsedURL.ETLDPlusOne != parsedDomain.ETLDPlusOne { - return - } - - if !includeSubdomains && parsedURL.Domain != parsedDomain.Domain && parsedURL.Domain != "www."+parsedDomain.Domain { - return - } - - isInScope = true - - return -} - func FixURL(URL string) (fixedURL string) { fixedURL = URL diff --git a/pkg/xurlfind3r/sources/utils_test.go b/pkg/xurlfind3r/sources/utils_test.go new file mode 100644 index 0000000..b3b1acc --- /dev/null +++ b/pkg/xurlfind3r/sources/utils_test.go @@ -0,0 +1,68 @@ +package sources + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestFixURL(t *testing.T) { + t.Run("Remove Quotes and Spaces", func(t *testing.T) { + url := "\" http://example.com/path \"" + expected := "http://example.com/path" + assert.Equal(t, expected, FixURL(url)) + }) + + t.Run("Remove Unbalanced Quotes", func(t *testing.T) { + url := "http://example.com/path?q=\"value" + expected := "http://example.com/path?q=" + assert.Equal(t, expected, FixURL(url)) + }) + + t.Run("Remove Unbalanced Brackets", func(t *testing.T) { + url := "http://example.com/path?q=[value" + expected := "http://example.com/path?q=" + assert.Equal(t, expected, FixURL(url)) + }) + + t.Run("Remove Trailing Semicolon", func(t *testing.T) { + url := "http://example.com/path?q=value;" + expected := "http://example.com/path?q=value" + assert.Equal(t, expected, FixURL(url)) + }) + + t.Run("Handle Escaped Characters", func(t *testing.T) { + url := "http://example.com/path%20with%20spaces" + expected := "http://example.com/path with spaces" + assert.Equal(t, expected, FixURL(url)) + }) +} + +func TestFindUnbalancedQuote(t *testing.T) { + t.Run("Balanced Quotes", func(t *testing.T) { + s := "text with 'balanced' quotes" + assert.Equal(t, -1, findUnbalancedQuote(s, '\'')) + }) + + t.Run("Unbalanced Quotes", func(t *testing.T) { + s := "text with 'unbalanced quotes" + assert.Equal(t, 10, findUnbalancedQuote(s, '\'')) + }) +} + +func TestFindUnbalancedBracket(t *testing.T) { + t.Run("Balanced Brackets", func(t *testing.T) { + s := "text with [balanced] brackets" + assert.Equal(t, -1, findUnbalancedBracket(s, '[', ']')) + }) + + t.Run("Unbalanced Opening Bracket", func(t *testing.T) { + s := "text with [unbalanced brackets" + assert.Equal(t, 10, findUnbalancedBracket(s, '[', ']')) + }) + + t.Run("Unbalanced Closing Bracket", func(t *testing.T) { + s := "text with unbalanced] brackets" + assert.Equal(t, 20, findUnbalancedBracket(s, '[', ']')) + }) +} diff --git a/pkg/xurlfind3r/sources/wayback/wayback.go b/pkg/xurlfind3r/sources/wayback/wayback.go new file mode 100644 index 0000000..d6fd32d --- /dev/null +++ b/pkg/xurlfind3r/sources/wayback/wayback.go @@ -0,0 +1,102 @@ +package wayback + +import ( + "encoding/json" + "fmt" + "net/http" + + hqgolimiter "github.com/hueristiq/hq-go-limiter" + hqgourl "github.com/hueristiq/hq-go-url" + "github.com/hueristiq/xurlfind3r/pkg/httpclient" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" +) + +type Source struct{} + +func (source *Source) Run(cfg *sources.Configuration, domain string) <-chan sources.Result { + results := make(chan sources.Result) + + go func() { + defer close(results) + + var err error + + for page := uint(0); ; page++ { + getURLsReqURL := 
fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=*.%s/*&output=json&collapse=urlkey&fl=timestamp,original,mimetype,statuscode,digest&pageSize=100&page=%d", domain, page) + + limiter.Wait() + + var getURLsRes *http.Response + + getURLsRes, err = httpclient.SimpleGet(getURLsReqURL) + if err != nil { + result := sources.Result{ + Type: sources.ResultError, + Source: source.Name(), + Error: err, + } + + results <- result + + httpclient.DiscardResponse(getURLsRes) + + break + } + + var getURLsResData [][]string + + if err = json.NewDecoder(getURLsRes.Body).Decode(&getURLsResData); err != nil { + result := sources.Result{ + Type: sources.ResultError, + Source: source.Name(), + Error: err, + } + + results <- result + + getURLsRes.Body.Close() + + break + } + + getURLsRes.Body.Close() + + // check if there's results, wayback's pagination response + // is not always correct when using a filter + if len(getURLsResData) == 0 { + break + } + + getURLsResData = getURLsResData[1:] + + for _, record := range getURLsResData { + URL := record[1] + + if !cfg.IsInScope(URL) { + continue + } + + result := sources.Result{ + Type: sources.ResultURL, + Source: source.Name(), + Value: URL, + } + + results <- result + } + } + }() + + return results +} + +func (source *Source) Name() string { + return sources.WAYBACK +} + +var limiter = hqgolimiter.New(&hqgolimiter.Configuration{ + RequestsPerMinute: 40, + MinimumDelayInSeconds: 30, +}) + +var up = hqgourl.NewParser(hqgourl.ParserWithDefaultScheme("http")) diff --git a/pkg/xurlfind3r/xurlfind3r.go b/pkg/xurlfind3r/xurlfind3r.go new file mode 100644 index 0000000..bc107a9 --- /dev/null +++ b/pkg/xurlfind3r/xurlfind3r.go @@ -0,0 +1,223 @@ +package xurlfind3r + +import ( + "regexp" + "sync" + + hqgourl "github.com/hueristiq/hq-go-url" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/bevigil" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/commoncrawl" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/github" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/intelx" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/otx" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/urlscan" + "github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources/wayback" +) + +// Finder is the main structure that manages the interaction with OSINT sources. +// It holds the available data sources and the configuration used for searching. +type Finder struct { + // sources is a map of source names to their corresponding implementations. + // Each source implements the Source interface, which allows domain searches. + sources map[string]sources.Source + // configuration contains configuration options such as API keys + // and other settings needed by the data sources. + configuration *sources.Configuration + FilterRegex *regexp.Regexp + MatchRegex *regexp.Regexp +} + +func (finder *Finder) IsURLInScope(domain, URL string, subdomainsInScope bool) (URLInScope bool) { + parsedURL, err := up.Parse(URL) + if err != nil { + return + } + + if parsedURL.Domain == nil { + return + } + + ETLDPlusOne := parsedURL.Domain.Root + + if parsedURL.Domain.TopLevel != "" { + ETLDPlusOne += "." + parsedURL.Domain.TopLevel + } + + parsedDomain := dp.Parse(domain) + + expectedETLDPlusOne := parsedDomain.Root + if parsedDomain.TopLevel != "" { + expectedETLDPlusOne += "." 
+ parsedDomain.TopLevel + } + + if ETLDPlusOne != expectedETLDPlusOne { + return + } + + if !subdomainsInScope && + parsedURL.Domain.String() != parsedDomain.String() && + parsedURL.Domain.String() != "www."+parsedDomain.String() { + + return + } + + URLInScope = true + + return +} + +// Find takes a domain name and starts the URLs search process across all +// the sources specified in the configuration. It returns a channel through which +// the search results (of type Result) are streamed asynchronously. +func (finder *Finder) Find(domain string) (results chan sources.Result) { + // Initialize the results channel where URLs findings are sent. + results = make(chan sources.Result) + + // Parse the given domain using a domain parser. + parsed := dp.Parse(domain) + + // Rebuild the domain as "root.tld" format. + domain = parsed.Root + "." + parsed.TopLevel + + finder.configuration.IsInScope = func(URL string) (isInScope bool) { + return finder.IsURLInScope(domain, URL, finder.configuration.IncludeSubdomains) + } + + // Launch a goroutine to perform the search concurrently across all sources. + go func() { + // Ensure the results channel is closed once all search operations complete. + defer close(results) + + // A thread-safe map to store already-seen URLs, avoiding duplicates. + seenURLs := &sync.Map{} + + // WaitGroup ensures all source goroutines finish before exiting. + wg := &sync.WaitGroup{} + + // Iterate over all the sources in the Finder. + for name := range finder.sources { + wg.Add(1) + + // Start a new goroutine for each source to fetch URLs concurrently. + go func(source sources.Source) { + // Decrement the WaitGroup counter when this goroutine completes. + defer wg.Done() + + // Call the source's Run method to start the subdomain search. + sResults := source.Run(finder.configuration, domain) + + // Process each result as it's received from the source. + for sResult := range sResults { + // If the result is a subdomain, process it. + if sResult.Type == sources.ResultURL { + // Check if the subdomain has already been seen using sync.Map. + _, loaded := seenURLs.LoadOrStore(sResult.Value, struct{}{}) + if loaded { + // If the subdomain is already in the map, skip it. + continue + } + + if (finder.MatchRegex != nil && !finder.MatchRegex.MatchString(sResult.Value)) || (finder.FilterRegex != nil && finder.MatchRegex == nil && finder.FilterRegex.MatchString(sResult.Value)) { + continue + } + } + + // Send the result down the results channel. + results <- sResult + } + }(finder.sources[name]) + } + + // Wait for all goroutines to finish before exiting. + wg.Wait() + }() + + // Return the channel that will stream URL results. + return +} + +// Configuration holds the configuration for Finder, including +// the sources to use, sources to exclude, and the necessary API keys. +type Configuration struct { + IncludeSubdomains bool + + // SourcesToUse is a list of source names that should be used for the search. + SourcesToUse []string + // SourcesToExclude is a list of source names that should be excluded from the search. + SourcesToExclude []string + // Keys contains the API keys for each data source. + Keys sources.Keys + + FilterPattern string + MatchPattern string +} + +var ( + // dp is a domain parser used to normalize domains into their root and top-level domain (TLD) components. + dp = hqgourl.NewDomainParser() + up = hqgourl.NewParser(hqgourl.ParserWithDefaultScheme("http")) +) + +// New creates a new Finder instance based on the provided Configuration. 
+// It initializes the Finder with the selected sources and ensures that excluded sources are not used. +func New(cfg *Configuration) (finder *Finder, err error) { + // Initialize a Finder instance with an empty map of sources and the provided configuration. + finder = &Finder{ + sources: map[string]sources.Source{}, + configuration: &sources.Configuration{ + IncludeSubdomains: cfg.IncludeSubdomains, + Keys: cfg.Keys, + }, + } + + if cfg.FilterPattern != "" { + finder.FilterRegex, err = regexp.Compile(cfg.FilterPattern) + if err != nil { + return + } + } + + if cfg.MatchPattern != "" { + finder.MatchRegex, err = regexp.Compile(cfg.MatchPattern) + if err != nil { + return + } + } + + // If no specific sources are provided, use the default list of all sources. + if len(cfg.SourcesToUse) < 1 { + cfg.SourcesToUse = sources.List + } + + // Loop through the selected sources and initialize each one + for _, source := range cfg.SourcesToUse { + // Depending on the source name, initialize the appropriate source and add it to the map. + switch source { + case sources.BEVIGIL: + finder.sources[source] = &bevigil.Source{} + case sources.COMMONCRAWL: + finder.sources[source] = &commoncrawl.Source{} + case sources.GITHUB: + finder.sources[source] = &github.Source{} + case sources.INTELLIGENCEX: + finder.sources[source] = &intelx.Source{} + case sources.OPENTHREATEXCHANGE: + finder.sources[source] = &otx.Source{} + case sources.URLSCAN: + finder.sources[source] = &urlscan.Source{} + case sources.WAYBACK: + finder.sources[source] = &wayback.Source{} + } + } + + // Remove any sources that are specified in the SourcesToExclude list. + for index := range cfg.SourcesToExclude { + source := cfg.SourcesToExclude[index] + + delete(finder.sources, source) + } + + // Return the Finder instance with all the selected sources. + return +}
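
For reference, a minimal usage sketch of the Finder API that this patch introduces in pkg/xurlfind3r. It is not part of the patch series itself: the main package, the example.com domain, and the restriction to the key-less Wayback source are assumptions for illustration; only the exported identifiers (xurlfind3r.New, Configuration, Find, sources.ResultURL, sources.ResultError, and the source-name constants) come from the diff above.

package main

import (
	"fmt"
	"log"

	"github.com/hueristiq/xurlfind3r/pkg/xurlfind3r"
	"github.com/hueristiq/xurlfind3r/pkg/xurlfind3r/sources"
)

func main() {
	// Build a Finder restricted to the Wayback Machine source, which the
	// diff shows requires no API key; sources.Keys is left at its zero value.
	finder, err := xurlfind3r.New(&xurlfind3r.Configuration{
		IncludeSubdomains: true,
		SourcesToUse:      []string{sources.WAYBACK},
	})
	if err != nil {
		log.Fatal(err)
	}

	// Find streams results asynchronously; the channel is closed once every
	// enabled source has finished, so a plain range drains it safely.
	for result := range finder.Find("example.com") {
		switch result.Type {
		case sources.ResultURL:
			fmt.Println(result.Value)
		case sources.ResultError:
			log.Printf("%s: %v", result.Source, result.Error)
		}
	}
}

Because Find deduplicates values through a sync.Map before sending them, the caller sees each URL at most once and needs no synchronization beyond ranging over the returned channel.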