From 4808c3b6c8c282f696bf773378d25e8307cc32a5 Mon Sep 17 00:00:00 2001 From: Albert Adler <22015497+TheNaubit@users.noreply.github.com> Date: Wed, 6 Mar 2024 17:14:25 +0100 Subject: [PATCH] feat: add improved protocol detection and handle messy strings in makeURL --- README.md | 3 + src/lib/helpers.ts | 381 ++++++++++++++++++++++++++++++++++++++++++++ src/makeURL.test.ts | 46 ++++++ src/makeURL.ts | 114 +++++++------ src/types.ts | 32 +++- 5 files changed, 525 insertions(+), 51 deletions(-) create mode 100644 src/lib/helpers.ts diff --git a/README.md b/README.md index c68c45a..2a87296 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,9 @@ Feel free to test and explore and if later on you need more guidance, read the w 👌 1kB minified and gzipped + + 🔒 Reliable. Even when you pass really messed up strings, it finds a way to build a valid URL (just check the tests for some examples) + ✍️ TypeScript types provided diff --git a/src/lib/helpers.ts b/src/lib/helpers.ts new file mode 100644 index 0000000..2340951 --- /dev/null +++ b/src/lib/helpers.ts @@ -0,0 +1,381 @@ +import type { + IConfig, + IDomainData, + IProtocolData, + IURLArrayData +} from "../types"; + +/** + * Detects the domain in a given string. + * + * @param str - The string to detect the domain from. Important: To detect a valid domain, the URL must contain a non-relative protocol (http:// or https://). + * @returns An object containing information about the domain. + * @example + * detectDomainInString("https://example.com/blog"); + * // => { hasDomain: true, domain: "example.com" } + * detectDomainInString("http://example/blog"); + * // => { hasDomain: false, domain: "" } + */ +export function detectDomainInString(str: string): IDomainData { + // We will use "new URL(...)" to detect it + // If it throws an error, it means the string is not a valid URL, so probably it doesn't have a domain + try { + const url = new URL(str); + + // But not throwing in the constructor does not mean it has a valid domain. 
For example: "https://example/blog" does not throw + + // So we will check if the hostname has at least one dot and then we have greater confidence that it is a domain + if (url.hostname.split(".").length < 2) { + throw null; // The thrown value is irrelevant: we only throw to jump to the catch block below, which returns the no-domain result + } + + return { + hasDomain: true, + domain: url.hostname + }; + } catch { + return { + hasDomain: false, + domain: "" + }; + } +} + +/** + * Detects the protocol in a given string. + * @param str - The string to detect the protocol from. It is trimmed and lowercased before the detection. + * @returns An object containing information about the protocol detection. For strings starting with `//`, `protocol` is always "relative" but `hasProtocol` is only true when a valid domain follows. + * @example + * detectProtocolInString("https://example.com/blog"); + * // => { hasProtocol: true, protocol: "https" } + * detectProtocolInString("example.com/blog"); + * // => { hasProtocol: false, protocol: "none" } + * detectProtocolInString("//example.com/blog"); + * // => { hasProtocol: true, protocol: "relative" } + * detectProtocolInString("//example/blog"); + * // => { hasProtocol: false, protocol: "relative" } + */ +export function detectProtocolInString(str: string): IProtocolData { + const cleanedStr = str.trim().toLowerCase(); + + if (cleanedStr.startsWith("//")) { + const urlToCheck = `https:${cleanedStr}`; // Since the domain detector function only works with URLs with a protocol, we add a fake non-relative protocol to the string + const detectDomainData = detectDomainInString(urlToCheck); + return { + hasProtocol: detectDomainData.hasDomain, // Even if it starts with a relative protocol string, we cannot count it as valid unless it is followed by a valid domain + protocol: "relative" // It starts with the relative protocol marker + }; + } + + // Regex that checks if the str starts with "http://" or "https://" + const hasProtocol = /^(http|https):\/\//.test(cleanedStr); + return { + hasProtocol, + protocol: hasProtocol + ? (cleanedStr.split("://")[0] as "http" | "https") + : "none" + }; +} + +/** + * Generates a temporary URL string by merging an array of fragments. 
+ * This generated URL is not safe and should only be used for detecting the domain. + * Important: This should only be used when we need to detect stuff in the URL like in the `detectDomainInString` function. It should NEVER be used to generate a valid URL. For that you should use the `safeStringArrayToURLString` function. + * + * @param array - An array of string fragments to be merged. + * @param hasProtocol - A boolean indicating whether the array has a protocol. + * @returns The merged URL string. + * @example + * getUnsafeMergedURLString(["https://", "example.com", "blog"], true); + * // => "example.com/blog" (the protocol fragment is skipped when hasProtocol is true) + */ +function getUnsafeMergedURLString( + array: Array, + hasProtocol: boolean +): string { + // We need to generate a temporary URL to detect the domain + // This generated URL is not really safe, meaning it could contain + // unescaped characters, wrong protocols, etc. But we don't care about that, since we only need it to detect the domain + // Just don't use it for anything else + return ( + array + // If the array has a protocol, we skip the first fragment + .slice(hasProtocol ? 1 : 0) + // We filter out empty fragments, keep in mind this is only for detecting the domain, so we don't care about empty fragments + .filter(v => v.trim() !== "") + // We could join using the `.join` method, but we need more control + // over how to join, so we use the `.map` method + .map((v, index) => { + // If it is the first item or if it starts with a dot (potentially a part of the domain), we don't join with "/" + if (index === 0 || v.startsWith(".")) return v; + // Anything else is joined with "/" + else return `/${v}`; + }) + // The `.map` above already prefixed each fragment with its separator, so we just concatenate the array without any separator + .join("") + ); +} + +/** + * Extracts the protocol from an array of fragments based on the given protocol index. + * + * @param array - The array of fragments. 
+ * @param protocolIndex - The index at which the protocol ends. + * @returns An array of fragments containing the extracted protocol. + * @example + * extractProtocolFromArray(["https://", "example", "", ".com", "blog"], 8); + * // => ["https://", "example", "", ".com", "blog"] (only the protocol is merged into the first fragment here; the remaining fragments are left as they are) + */ +function extractProtocolFromArray( + array: Array, + protocolIndex: number +): Array { + let currentLength = 0; + let protocolFragment = ""; + const returnedFragments: Array = []; + + array.forEach(fragment => { + // If the fragment is empty and there are already fragments in the returned fragments, we add the fragment to the returned fragments + // If there are no fragments in the returned fragments, we skip it + // since it could break our logic to handle the protocol and in any case it would be an empty fragment + if (fragment === "" && returnedFragments.length > 0) { + returnedFragments.push(fragment); + return; + } + // If the current length plus the length of the fragment is less than or equal to the protocol index, the whole fragment belongs to the protocol, so we add it to the protocol fragment + if (currentLength + fragment.length <= protocolIndex) { + protocolFragment += fragment; + currentLength += fragment.length; + // If the fragment straddles the protocol index, we split it: its head completes the protocol fragment and its tail goes to the returned fragments + } else if (currentLength < protocolIndex) { + const sliceIndex = protocolIndex - currentLength; + protocolFragment += fragment.slice(0, sliceIndex); + + if (sliceIndex < fragment.length) { + returnedFragments.push(protocolFragment); + protocolFragment = ""; + returnedFragments.push(fragment.slice(sliceIndex)); + } + + currentLength += sliceIndex; + // Otherwise the current length has already reached the protocol index, so the protocol is complete and we add the fragment to the returned fragments + } else { + if (protocolFragment !== "") { + returnedFragments.push(protocolFragment); + protocolFragment = "";  + } + returnedFragments.push(fragment); + } + }); + + if (protocolFragment !== "") { + returnedFragments.push(protocolFragment); + protocolFragment = 
""; + } + + return returnedFragments; +} + +/** + * Extracts the domain from an array of URL fragments. + * Important: The array entered must come from the extractProtocolFromArray function or the safeStringArrayAssembler function. + * + * @param array - The array of URL fragments. + * @param hasProtocolExtracted - A boolean indicating whether the protocol has been extracted from the URL, i.e. whether the first fragment of the array is the protocol (it is skipped during detection and added back at the beginning of the result). + * @returns An array of URL fragments with the domain extracted. + * @example + * extractDomainFromArray(["https://", "example", "", ".com", "blog"], true); + * // => ["https://", "example.com", "blog"] + * extractDomainFromArray(["example", "", ".com", "blog"], false); + * // => ["example.com", "blog"] + */ +function extractDomainFromArray( + array: Array, + hasProtocolExtracted: boolean +): Array { + // We need to generate a temporary URL to detect the domain + // This generated URL is not really safe, meaning it could contain + // unescaped characters, wrong protocols, etc. But we don't care about that, since we only need it to detect the domain + // Just don't use it for anything else + const tempURL = getUnsafeMergedURLString(array, hasProtocolExtracted); + + // Resulting URL has no protocol (we removed it if it was there to remove the case of relative protocols, incompatible with the `detectDomainInString` function) + // But the `detectDomainInString` function needs an input URL with a protocol, so we add a fake one + const domainData = detectDomainInString(`https://${tempURL}`); + + // If the URL does not contain a domain, there is nothing to "extract"/"sort", so we return the array as is + if (!domainData.hasDomain) return array; + + // If it contains a domain, we find the index right after the last character of the domain in the temporary URL. NOTE(review): the domain comes from the hostname parsed by `new URL(...)`, which may be normalized (e.g. lowercased); if it does not appear verbatim in tempURL, indexOf returns -1 - confirm that callers only pass already-normalized hosts + const domainIndex = + tempURL.indexOf(domainData.domain) + domainData.domain.length; + + // We need to create an array with the domain fragments but without the protocol (if it has one) + const safeArray = array.slice(hasProtocolExtracted ? 
1 : 0); + + let domainFragment = ""; + let currentLength = 0; + + // This will be the array of fragments we will return + let returnedFragments: Array = []; + + safeArray.forEach(fragment => { + // Empty fragments found before the domain is complete are dropped; empty fragments found after it are kept + if (fragment === "") { + // We need to check first if the current length is greater than or equal to the domain index, because that means we already found the full domain in the array + if (currentLength >= domainIndex) { + // If the returnedFragments has some item, that means we already found the domain and we already saved it, so we can just push the empty fragment into the array + if (returnedFragments.length > 0) { + returnedFragments.push(fragment); + } else { + // If the returnedFragments is empty, that means we haven't saved the domain yet into the array + // But since we already found it, we can just push it to the array + returnedFragments.push(domainFragment); + domainFragment = ""; + // And then we can push the empty fragment into the array + returnedFragments.push(fragment); + } + } + return; + } + + // If the current length plus the length of the fragment is less than or equal to the domain index, the whole fragment belongs to the domain, so we add it to the domain fragment + if (currentLength + fragment.length <= domainIndex) { + domainFragment += fragment; + currentLength += fragment.length; + } else if (currentLength < domainIndex) { + const sliceIndex = domainIndex - currentLength; + domainFragment += fragment.slice(0, sliceIndex); + + // If the sliceIndex is less than the length of the fragment, we push the domain fragment into the array and then we push the rest of the fragment + if (sliceIndex < fragment.length) { + returnedFragments.push(domainFragment); + domainFragment = ""; + returnedFragments.push(fragment.slice(sliceIndex)); + } + + currentLength += sliceIndex; + // Otherwise the current length has already reached the domain index, so the domain is complete and we add the fragment to the returned fragments + } else { + // If the domain fragment is not empty, we push it to the array + if (domainFragment 
!== "") { + returnedFragments.push(domainFragment); + domainFragment = ""; + } + + // Then we push the fragment to the array + returnedFragments.push(fragment); + } + }); + + // If the domain fragment is not empty, we push it to the array + if (domainFragment !== "") { + returnedFragments.push(domainFragment); + domainFragment = ""; + } + + // If the array had a protocol we have to add it back at the beginning + // of the array before returning it + if (hasProtocolExtracted) { + returnedFragments = [array[0], ...returnedFragments]; + } + + return returnedFragments; +} + +/** + * Assembles a safe string array by filtering out empty strings (only when `config.allowEmptyPathSegments` is false) and detecting the protocol and the domain. + * @param fragments - An array of strings representing URL fragments. + * @param config - The configuration object. Only `allowEmptyPathSegments` is read by this function. + * @returns An object containing the assembled URL fragments, information about the presence of a protocol, and the detected protocol. + * @example + * // In all these examples `config.allowEmptyPathSegments` is false + * safeStringArrayAssembler(["https://", "example.com", "blog"], config); + * // => { array: ["https://", "example.com", "blog"], hasProtocol: true, protocol: "https" } + * safeStringArrayAssembler(["example.com", "blog"], config); + * // => { array: ["example.com", "blog"], hasProtocol: false, protocol: "none" } + * safeStringArrayAssembler(["//", "example.com", "blog"], config); + * // => { array: ["//", "example.com", "blog"], hasProtocol: true, protocol: "relative" } + * safeStringArrayAssembler(["https://example.com", "blog", ""], config); + * // => { array: ["https://", "example.com", "blog"], hasProtocol: true, protocol: "https" } + */ +export function safeStringArrayAssembler( + fragments: Array, + config: IConfig +): IURLArrayData { + // We start from a copy of the fragments; empty strings are filtered out below only if the config allows it + let filteredFragments = [...fragments]; + + // If "allowEmptyPathSegments" is false, we can clean empty path segments + // but if it is true, we cannot, because those empty fragments will be joined later with slashes, so technically the final URL wouldn't be what we expect. 
+ // For example, if we have ["example.com", "", "blog"], the final URL should be "example.com//blog" instead of "example.com/blog" if "allowEmptyPathSegments" is true + if (!config.allowEmptyPathSegments) { + filteredFragments = filteredFragments.filter( + fragment => fragment.trim() !== "" + ); + } + + // Now we need to detect if it contains a protocol + const potentiallyWrongJoinedURL = filteredFragments.join(""); // We don't use here the getUnsafeMergedURLString function because it would fail, since it requires to have already detected the protocol + const { hasProtocol, protocol } = detectProtocolInString( + potentiallyWrongJoinedURL + ); + + // We need to join all the fragments until the end of the protocol (if it has one) + let returnedFragments: Array = filteredFragments; + + if (hasProtocol) { + // If it has a protocol, we need to find the index of the end of the protocol + const protocolIndex = + potentiallyWrongJoinedURL.indexOf( + protocol === "relative" ? "//" : "://" + ) + (protocol === "relative" ? 
2 : 3); + + // And then sort and merge items in the array so the first item is the whole protocol + returnedFragments = extractProtocolFromArray( + filteredFragments, + protocolIndex + ); + + // We now know that the first fragment is the protocol (if there are any fragments) + // We need to check if there is a second fragment and if there is one, we need to remove any leading slashes it might contain + // That way we can be sure the final URL won't contain things like "https:///example.com/blog" + if (returnedFragments.length > 1) { + returnedFragments[1] = returnedFragments[1].replace(/^\/*/, ""); + } + } + + // No matter if there is protocol or not, we need to check if there is a domain, and if there is one, we need to sort and merge items in the array so the domain is a single fragment in the first position (if there is no protocol) or in the second position (if there is a protocol) + returnedFragments = extractDomainFromArray(returnedFragments, hasProtocol); + + return { + array: returnedFragments, + hasProtocol, + protocol + }; +} + +/** + * Converts a safe string array to a URL string. + * IMPORTANT: This function assumes the array is safe, meaning it has been processed by the safeStringArrayAssembler function. + * The generated string is a potential valid URL but is not guaranteed to be valid. + * @param urlArrayData - The data containing the array and whether it has a protocol. + * @returns The URL string. 
+ * @example + * safeStringArrayToURLString({ array: ["https://", "example.com", "blog"], hasProtocol: true, protocol: "https" }); + * // => "https://example.com/blog" + * safeStringArrayToURLString({ array: ["example.com", "blog"], hasProtocol: false, protocol: "none" }); + * // => "example.com/blog" + */ +export function safeStringArrayToURLString( + urlArrayData: IURLArrayData +): string { + const { array, hasProtocol } = urlArrayData; + + if (hasProtocol) { + // If it has a protocol, we know the first fragment is the protocol, so we skip it + const slicedArray = array.length > 1 ? array.slice(1) : []; + return `${array.length > 0 ? array[0] : ""}${slicedArray.join("/")}`; + } + + return array.join("/"); +} diff --git a/src/makeURL.test.ts b/src/makeURL.test.ts index 6161dd6..8000b93 100644 --- a/src/makeURL.test.ts +++ b/src/makeURL.test.ts @@ -658,6 +658,52 @@ describe("makeURL", () => { makeURL("//example.com", "blog", "post/1"); }).toThrow("The generated URL is not valid: //example.com/blog/post/1"); }); + + it("should detect the protocol even if it is splitted in several strings", () => { + // Set the configuration object to some default values + setMakeURLDefaultConfig({ + forceProtocol: "auto", + trailingSlash: "remove", + strict: true, + allowEmptyPathSegments: false + }); + + const url = makeURL( + "htt", + "p", + "s:", + "//example", + ".com", + "blog", + "post/1" + ); + expect(url).toBe("https://example.com/blog/post/1"); + }); + + it("should build valid URLs even if the strings provided are a mess", () => { + // Set the configuration object to some default values + setMakeURLDefaultConfig({ + forceProtocol: "auto", + trailingSlash: "remove", + strict: true, + allowEmptyPathSegments: true + }); + + const url = makeURL( + "htt", + "p", + "", + "s:", + "//example", + "", + ".com", + "", + " ", + "blog", + "post/1" + ); + expect(url).toBe("https://example.com//%20/blog/post/1"); + }); }); describe("setMakeURLDefaultConfig", () => { diff --git 
a/src/makeURL.ts b/src/makeURL.ts index 6d8d405..ea9ae66 100644 --- a/src/makeURL.ts +++ b/src/makeURL.ts @@ -1,3 +1,8 @@ +import { + detectDomainInString, + safeStringArrayAssembler, + safeStringArrayToURLString +} from "./lib/helpers"; import type { IParams, IConfig } from "./types"; export const BASE_DEFAULT_MAKE_URL_CONFIG: IConfig = { @@ -103,44 +108,31 @@ export default function makeURL( } }; - let isURLWithRelativeProtocol = false; + const safeStringArrayAssemblerData = safeStringArrayAssembler( + stringFragments, + safeParams.config + ); // First, clean the fragments by removing leading and trailing slashes, replacing spaces with dashes and encoding invalid characters - const cleanedFragments = stringFragments.map((fragment, index) => { - let baseStartOfFragment = ""; - let baseFragment = fragment.trim(); - if (index === 0) { - if (baseFragment.startsWith("http://")) { - baseStartOfFragment = "http://"; - baseFragment = baseFragment.slice(7); - } else if (baseFragment.startsWith("https://")) { - baseStartOfFragment = "https://"; - baseFragment = baseFragment.slice(8); - // If the URL starts with `//` and includes a dot, it means it's a URL with a relative protocol, so we need to extract the `//` and add it back after the cleaning - } else if (baseFragment.startsWith("//") && baseFragment.includes(".")) { - baseStartOfFragment = "//"; - baseFragment = baseFragment.slice(2); - isURLWithRelativeProtocol = true; + safeStringArrayAssemblerData.array = safeStringArrayAssemblerData.array.map( + (fragment, index) => { + // Since we used the safeStringArrayAssembler function, we know that the first fragment is the protocol (if there is any) + if (index === 0 && safeStringArrayAssemblerData.hasProtocol) { + // Since it is a protocol, we can just return it as is + return fragment; } + + // Since it does not contain a protocol, we need to clean it + return fragment + .split("/") + .map(f => encodeURIComponent(f).replace(/%3A/g, ":")) + .join("/"); } - return 
`${baseStartOfFragment}${baseFragment - .split("/") - .map(f => encodeURIComponent(f).replace(/%3A/g, ":")) - .join("/")}`; - // Replace leading and trailing slashes with nothing - // .replace(/^[\/\s]+|[\/\s]+$/g, "") - // Replace spaces with dashes - // .replace(/\s+/g, "-") - // Encode invalid characters - // eslint-disable-next-line no-useless-escape - // .replace(/[^a-zA-Z0-9-._~:/?#\[\]@!$&'()*+,;=]/g, match => { - // return encodeURIComponent(match); - // }) - }); + ); + // Check if the first fragment includes a protocol and if it doesnt, add the forceProtocol protocol (only if it's not set to "none") if ( - !cleanedFragments[0].includes("://") && - !isURLWithRelativeProtocol && + !safeStringArrayAssemblerData.hasProtocol && safeParams.config.forceProtocol !== "none" ) { // If the forceProtocol is set to "auto" or "auto-insecure", we need to check if the URL starts with a domain. If it does, we add the forceProtocol, if it doesn't, we don't add it @@ -151,23 +143,34 @@ export default function makeURL( // If `forceProtocol`is `auto-insecure`, we use `http` instead of `https` (if the URL starts with a domain) const shouldUseHttps = safeParams.config.forceProtocol === "auto"; - try { - // new URL works with strings without domain extensions (like "https://a") so to avoid adding the protocol to relative URLs, we will run the `new URL(...)` check only if the fragment contains a dot - if (cleanedFragments[0].includes(".")) { - new URL("https://" + cleanedFragments[0]); - // If it doesn't throw an error, it means the first fragment is a domain, so we add the protocol - cleanedFragments[0] = `${shouldUseHttps ? 
"https" : "http"}://${cleanedFragments[0]}`; - } - } catch { - // If it fails, it means the first fragment is not a domain, so we don't add the protocol + const temporalDomainInfo = detectDomainInString( + `https://${safeStringArrayToURLString(safeStringArrayAssemblerData)}` + ); // Add some protocol to the URL to make sure the `detectDomainInString` function works as expected, since we now know that the temporalJoinedURL has no protocol + + if (temporalDomainInfo.hasDomain) { + // Insert a new item at the beginning of the safeStringArrayAssemblerData array + safeStringArrayAssemblerData.array = [ + `${shouldUseHttps ? "https" : "http"}://`, + ...safeStringArrayAssemblerData.array + ]; + safeStringArrayAssemblerData.hasProtocol = true; + safeStringArrayAssemblerData.protocol = shouldUseHttps + ? "https" + : "http"; } } else { - cleanedFragments[0] = `${safeParams.config.forceProtocol}://${cleanedFragments[0]}`; + // Insert a new item at the beginning of the safeStringArrayAssemblerData array + safeStringArrayAssemblerData.array = [ + `${safeParams.config.forceProtocol}://`, + ...safeStringArrayAssemblerData.array + ]; + safeStringArrayAssemblerData.hasProtocol = true; + safeStringArrayAssemblerData.protocol = safeParams.config.forceProtocol; } } // Generate the URL by joining the fragments - let url = cleanedFragments.join("/"); + let url = safeStringArrayToURLString(safeStringArrayAssemblerData); // If the trailingSlash is set to "add" and the URL does not end with a slash, add it if (safeParams.config.trailingSlash === "add" && !url.endsWith("/")) { @@ -189,8 +192,22 @@ export default function makeURL( // We need to support relative protocol so before doing the replacement, check if the url starts with `//` and if it does, extract it and add it back after the replacement let relativeProtocol = ""; - if (isURLWithRelativeProtocol) { - relativeProtocol = "//"; + + // If the protocol is detected as relative, there are two options + if 
(safeStringArrayAssemblerData.protocol === "relative") { + // One, the URL contains the relative protocol and also contains a + // valid domain (or subdomain) + if (safeStringArrayAssemblerData.hasProtocol) { + // Then we treat it as a URl with a valid relative protocol + relativeProtocol = "//"; + // Or two, the URL contains the relative protocol but it does not contain + // a valid domain (or subdomain) + } else { + // Then we treat it as an absolute URL instead + relativeProtocol = "/"; + } + + // In any case, we remove the relative protocol from the URL url = url.slice(2); } @@ -297,10 +314,11 @@ export default function makeURL( // Important: If we are using a relative URL or an absolute URL without host, this will throw an error, so we should not use the `strict` mode in those cases if (safeParams.config.strict) { try { - const testedURL = new URL(url); + // This function checks if it is a valid URL and also if it contains a valid + // domain (or subdomain, etc...), so we can be sure that it is a valid URL + const testResultData = detectDomainInString(url); - // If the host does not include a dot, it means it's not a valid domain, so we throw an error - if (!testedURL.host.includes(".")) { + if (!testResultData.hasDomain) { throw null; } } catch { diff --git a/src/types.ts b/src/types.ts index 814c192..5a22d96 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,9 +1,22 @@ +export type TForceProtocol = + | "http" + | "https" + | "none" + | "auto" + | "auto-insecure"; + +export type TTrailingSlash = "add" | "remove"; + +export type TArraySerializer = "stringify" | "repeat" | "comma"; + +export type TProtocol = "http" | "https" | "relative" | "none"; + export interface IConfig { - forceProtocol: "http" | "https" | "none" | "auto" | "auto-insecure"; - trailingSlash: "add" | "remove"; + forceProtocol: TForceProtocol; + trailingSlash: TTrailingSlash; strict: boolean; allowEmptyPathSegments: boolean; - arraySerializer: "stringify" | "repeat" | "comma"; + 
arraySerializer: TArraySerializer; } export interface IParams { @@ -11,3 +24,16 @@ export interface IParams { hash: string; config: T; } +export interface IProtocolData { + hasProtocol: boolean; + protocol: TProtocol; +} + +export interface IDomainData { + hasDomain: boolean; + domain: string; +} + +export interface IURLArrayData extends IProtocolData { + array: Array; +}