Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

builtin: implement, document and test url-parse #1715

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 0 additions & 8 deletions connect.c
Original file line number Diff line number Diff line change
Expand Up @@ -693,14 +693,6 @@ enum protocol {
PROTO_GIT
};

int url_is_local_not_ssh(const char *url)
{
const char *colon = strchr(url, ':');
const char *slash = strchr(url, '/');
return !colon || (slash && slash < colon) ||
(has_dos_drive_prefix(url) && is_valid_path(url));
}

static const char *prot_name(enum protocol protocol)
{
switch (protocol) {
Expand Down
1 change: 0 additions & 1 deletion connect.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ int git_connection_is_socket(struct child_process *conn);
int server_supports(const char *feature);
int parse_feature_request(const char *features, const char *feature);
const char *server_feature_value(const char *feature, size_t *len_ret);
int url_is_local_not_ssh(const char *url);

struct packet_reader;
enum protocol_version discover_version(struct packet_reader *reader);
Expand Down
1 change: 1 addition & 0 deletions remote.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "gettext.h"
#include "hex.h"
#include "remote.h"
#include "url.h"
#include "urlmatch.h"
#include "refs.h"
#include "refspec.h"
Expand Down
8 changes: 8 additions & 0 deletions url.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,11 @@ void str_end_url_with_slash(const char *url, char **dest)
free(*dest);
*dest = strbuf_detach(&buf, NULL);
}

int url_is_local_not_ssh(const char *url)
{
const char *colon = strchr(url, ':');
const char *slash = strchr(url, '/');
return !colon || (slash && slash < colon) ||
(has_dos_drive_prefix(url) && is_valid_path(url));
}
2 changes: 2 additions & 0 deletions url.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@ char *url_decode_parameter_value(const char **query);
void end_url_with_slash(struct strbuf *buf, const char *url);
void str_end_url_with_slash(const char *url, char **dest);

int url_is_local_not_ssh(const char *url);

#endif /* URL_H */
90 changes: 90 additions & 0 deletions urlmatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "hex-ll.h"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Ghanshyam Thakkar wrote (reply to this):

On Sun, 28 Apr 2024, Matheus Afonso Martins Moreira via GitGitGadget <[email protected]> wrote:
> From: Matheus Afonso Martins Moreira <[email protected]>
> 
> Define general parsing function that supports all Git URLs
> including scp style URLs such as hostname:~user/repo.
> Has the same interface as the URL normalization function
> and uses the same data structures, facilitating its use.
> It's adapted from the algorithm used to process URLs in connect.c,
> so it should support the same inputs.
> 
> Signed-off-by: Matheus Afonso Martins Moreira <[email protected]>
> ---
>  urlmatch.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  urlmatch.h |  1 +
>  2 files changed, 91 insertions(+)
> 
> diff --git a/urlmatch.c b/urlmatch.c
> index 1d0254abacb..5a442e31fa2 100644
> --- a/urlmatch.c
> +++ b/urlmatch.c
> @@ -3,6 +3,7 @@
>  #include "hex-ll.h"
>  #include "strbuf.h"
>  #include "urlmatch.h"
> +#include "url.h"
>  
>  #define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
>  #define URL_DIGIT "0123456789"
> @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info)
>  	return url_normalize_1(url, out_info, 0);
>  }
>  
> +enum protocol {
> +	PROTO_UNKNOWN = 0,
> +	PROTO_LOCAL,
> +	PROTO_FILE,
> +	PROTO_SSH,
> +	PROTO_GIT,
> +};
> +
> +static enum protocol url_get_protocol(const char *name, size_t n)
> +{
> +	if (!strncmp(name, "ssh", n))
> +		return PROTO_SSH;
> +	if (!strncmp(name, "git", n))
> +		return PROTO_GIT;
> +	if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */
> +		return PROTO_SSH;
> +	if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */
> +		return PROTO_SSH;
> +	if (!strncmp(name, "file", n))
> +		return PROTO_FILE;
> +	return PROTO_UNKNOWN;
> +}
> +
> +char *url_parse(const char *url_orig, struct url_info *out_info)
> +{
> +	struct strbuf url;
> +	char *host, *separator;
> +	char *detached, *normalized;
> +	enum protocol protocol = PROTO_LOCAL;
> +	struct url_info local_info;
> +	struct url_info *info = out_info? out_info : &local_info;
> +	bool scp_syntax = false;
> +
> +	if (is_url(url_orig)) {
> +		url_orig = url_decode(url_orig);
> +	} else {
> +		url_orig = xstrdup(url_orig);
> +	}
> +
> +	strbuf_init(&url, strlen(url_orig) + sizeof("ssh://"));
> +	strbuf_addstr(&url, url_orig);
> +
> +	host = strstr(url.buf, "://");
> +	if (host) {
> +		protocol = url_get_protocol(url.buf, host - url.buf);
> +		host += 3;
> +	} else {
> +		if (!url_is_local_not_ssh(url.buf)) {
> +			scp_syntax = true;
> +			protocol = PROTO_SSH;
> +			strbuf_insertstr(&url, 0, "ssh://");
> +			host = url.buf + 6;
> +		}
> +	}
Interesting. 

    `
    $ ./git url-parse -c protocol file:/test/test
    ssh
    `

seems like only having a single slash after the 'protocol:' prints
'ssh' always (I think this may not even be a valid url). After this 'else'
block, the url turns into 'ssh://file/test/test'. Will examine the details
later. Not that it's your code's doing, and rather the result of
url_is_local_not_ssh(). But just wanted to point this out and ask if this
should error out or is this an intended behavior that I can't figure out. 

Thanks.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Torsten Bögershausen wrote (reply to this):

[]
> Interesting.
>
>     `
>     $ ./git url-parse -c protocol file:/test/test
>     ssh
>     `
>
> seems like only having a single slash after the 'protocol:' prints
> 'ssh' always (I think this may not even be a valid url). After this 'else'
> block, the url turns into 'ssh://file/test/test'. Will examine the details
> later. Not that it's your code's doing, and rather the result of
> url_is_local_not_ssh(). But just wanted to point this out and ask if this
> should error out or is this an intended behavior that I can't figure out.

ssh is the correct answer, try something like

`git clone localhost:/home/myself/project/git.git`

It is the scp syntax, supported by Git as well.
From `man scp`

    scp copies files between hosts on a network.
    []
    The source and target may be specified as a local pathname,
    a remote host with optional path in the form
    [user@]host:[path],
    or a URI in the form scp://[user@]host[:port][/path].
    Local file names can be made explicit using absolute or relative pathnames
    to avoid scp treating file names containing ‘:’ as host specifiers.

So yes, they share similar problems
with the ':' that could mean different things when using the short form.

#include "strbuf.h"
#include "urlmatch.h"
#include "url.h"

#define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
#define URL_DIGIT "0123456789"
Expand Down Expand Up @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info)
return url_normalize_1(url, out_info, 0);
}

enum protocol {
PROTO_UNKNOWN = 0,
PROTO_LOCAL,
PROTO_FILE,
PROTO_SSH,
PROTO_GIT,
};

static enum protocol url_get_protocol(const char *name, size_t n)
{
if (!strncmp(name, "ssh", n))
return PROTO_SSH;
if (!strncmp(name, "git", n))
return PROTO_GIT;
if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */
return PROTO_SSH;
if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */
return PROTO_SSH;
if (!strncmp(name, "file", n))
return PROTO_FILE;
return PROTO_UNKNOWN;
}

char *url_parse(const char *url_orig, struct url_info *out_info)
{
struct strbuf url;
char *host, *separator;
char *detached, *normalized;
enum protocol protocol = PROTO_LOCAL;
struct url_info local_info;
struct url_info *info = out_info? out_info : &local_info;
bool scp_syntax = false;

if (is_url(url_orig)) {
url_orig = url_decode(url_orig);
} else {
url_orig = xstrdup(url_orig);
}

strbuf_init(&url, strlen(url_orig) + sizeof("ssh://"));
strbuf_addstr(&url, url_orig);

host = strstr(url.buf, "://");
if (host) {
protocol = url_get_protocol(url.buf, host - url.buf);
host += 3;
} else {
if (!url_is_local_not_ssh(url.buf)) {
scp_syntax = true;
protocol = PROTO_SSH;
strbuf_insertstr(&url, 0, "ssh://");
host = url.buf + 6;
}
}

/* path starts after ':' in scp style SSH URLs */
if (scp_syntax) {
separator = strchr(host, ':');
if (separator) {
if (separator[1] == '/')
strbuf_remove(&url, separator - url.buf, 1);
else
*separator = '/';
}
}

detached = strbuf_detach(&url, NULL);
normalized = url_normalize(detached, info);
free(detached);

if (!normalized) {
return NULL;
}

/* point path to ~ for URL's like this:
*
* ssh://host.xz/~user/repo
* git://host.xz/~user/repo
* host.xz:~user/repo
*
*/
if (protocol == PROTO_GIT || protocol == PROTO_SSH) {
if (normalized[info->path_off + 1] == '~')
info->path_off++;
}

return normalized;
}

static size_t url_match_prefix(const char *url,
const char *url_prefix,
size_t url_prefix_len)
Expand Down
1 change: 1 addition & 0 deletions urlmatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct url_info {
};

char *url_normalize(const char *, struct url_info *);
char *url_parse(const char *, struct url_info *);

struct urlmatch_item {
size_t hostmatch_len;
Expand Down