Skip to content

Commit

Permalink
fixed leading double slash issue
Browse files Browse the repository at this point in the history
  • Loading branch information
s-ferri-fortop committed Oct 6, 2023
1 parent 45e1948 commit 5c286e0
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 1 deletion.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ dist/
*venv*/

# test
.tox/
.tox/

.idea/
2 changes: 2 additions & 0 deletions src/protego.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ def _quote_path(self, path):
def _quote_pattern(self, pattern):
if pattern.startswith("https://") or pattern.startswith("http://"):
pattern = "/" + pattern
if pattern.startswith("//"):
pattern = "//" + pattern

# Corner case for query-only (e.g. '/abc?') and param-only (e.g. '/abc;') URLs.
# Save the last character; otherwise, urlparse will kill it.
Expand Down
10 changes: 10 additions & 0 deletions tests/test_protego.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,16 @@ def test_bytestrings(self):

self.assertEqual("Protego.parse expects str, got bytes", str(context.exception))

def test_leading_double_slash_in_pattern(self):
    """Regression test for a Disallow pattern that starts with ``//``.

    With the rule ``Disallow: //folder/*`` applied to every user agent,
    only URLs whose path begins with ``//folder/`` are expected to be
    blocked; the single-slash ``/folder`` variants and the bare
    ``//folder`` path must stay fetchable.
    """
    robots_txt = "User-Agent: *\nDisallow: //folder/*\n"
    parser = Protego.parse(robots_txt)

    # URLs the rule must not block, checked in the same order as before.
    permitted_urls = (
        "http://example.com/",
        "http://example.com/folder",
        "http://example.com/folder/",
        "http://example.com/folder/page",
        "http://example.com//folder",
    )
    for url in permitted_urls:
        self.assertTrue(parser.can_fetch(url, "FooBot"))

    # The only URL whose path actually falls under '//folder/*'.
    self.assertFalse(
        parser.can_fetch("http://example.com//folder/page", "FooBot")
    )

def test_visit_time(self):
"""Some website specified allow time for crawling in UTC"""
content = "User-Agent: *\nVisit-time: 0200 0630\nUser-Agent: NoTime"
Expand Down

0 comments on commit 5c286e0

Please sign in to comment.