Skip to content

Commit

Permalink
parser: handle period at end of url
Browse files Browse the repository at this point in the history
Fix URL parsing when a period appears at the end of the URL by
disallowing it from being the final character of a
URL.

Some characters are not allowed to be the final character of a URL.
Presently, the disallowed characters are the period and the comma
(see char_disallowed_at_end_url()).
A character is the last character in the URL if it is followed by
whitespace (as determined by is_whitespace()) or if it's the last
character in the string.

Signed-off-by: kernelkind <kernelkind@gmail.com>
Tested-by: William Casarin <jb55@jb55.com>
Signed-off-by: William Casarin <jb55@jb5.com>
  • Loading branch information
kernelkind authored and jb55 committed Dec 28, 2023
1 parent 644124f commit a841b44
Showing 1 changed file with 50 additions and 3 deletions.
53 changes: 50 additions & 3 deletions src/content_parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,53 @@ int push_block(struct ndb_content_parser *p, struct note_block *block)
return 0;
}



/*
 * Report whether the byte following `cur` terminates the current token.
 *
 * Returns 1 when `cur` is the last byte before `end` (running out of
 * input counts the same as whitespace), otherwise the result of
 * is_whitespace() on the following byte. Returns 0 when the following
 * position would lie past `end`.
 */
static inline int next_char_is_whitespace(unsigned char *cur, unsigned char *end)
{
	unsigned char *following = cur + 1;

	/* `cur` is the very last byte of the buffer */
	if (following == end)
		return 1;

	/* there is a real byte after `cur`: classify it */
	if (following < end)
		return is_whitespace(*following);

	/* nothing to look at */
	return 0;
}

/*
 * Returns 1 if `c` is a character that must not be the last character
 * of a URL (currently '.' and ','), 0 otherwise.
 */
static inline int char_disallowed_at_end_url(char c)
{
	switch (c) {
	case '.':
	case ',':
		return 1;
	default:
		return 0;
	}
}

/*
 * Decide whether URL scanning should stop at `cur`.
 *
 * Returns 1 when the byte at `cur` is whitespace, or when it is a
 * character rejected by char_disallowed_at_end_url() and
 * next_char_is_whitespace() reports that it is followed by whitespace
 * or the end of the buffer. Returns 0 in every other case.
 *
 * `cur` is dereferenced, so it must point at a readable byte.
 */
static int is_final_url_char(unsigned char *cur, unsigned char *end)
{
	if (is_whitespace(*cur))
		return 1;

	// something other than whitespace follows, so the url keeps going
	if (!next_char_is_whitespace(cur, end))
		return 0;

	// this would be the url's last char; stop here only if that
	// char isn't permitted in the trailing position
	return char_disallowed_at_end_url(*cur);
}

/*
 * Advance `cur->p` until is_final_url_char() reports a stopping point
 * or the cursor reaches `cur->end`.
 *
 * Returns 1 if a stopping point was found after at least one byte was
 * consumed, 0 if the stopping point was at the starting position, and
 * `or_end` if the end of the buffer was reached without finding one.
 */
static int consume_until_end_url(struct cursor *cur, int or_end)
{
	unsigned char *origin;

	for (origin = cur->p; cur->p < cur->end; cur->p++) {
		if (is_final_url_char(cur->p, cur->end))
			return cur->p != origin;
	}

	return or_end;
}

static int consume_url_fragment(struct cursor *cur)
{
int c;
Expand All @@ -376,7 +423,7 @@ static int consume_url_fragment(struct cursor *cur)

cur->p++;

return consume_until_whitespace(cur, 1);
return consume_until_end_url(cur, 1);
}

static int consume_url_path(struct cursor *cur)
Expand All @@ -393,7 +440,7 @@ static int consume_url_path(struct cursor *cur)
while (cur->p < cur->end) {
c = *cur->p;

if (c == '?' || c == '#' || is_whitespace(c)) {
if (c == '?' || c == '#' || is_final_url_char(cur->p, cur->end)) {
return 1;
}

Expand All @@ -411,7 +458,7 @@ static int consume_url_host(struct cursor *cur)
while (cur->p < cur->end) {
c = *cur->p;
// TODO: handle IDNs
if (is_alphanumeric(c) || c == '.' || c == '-')
if ((is_alphanumeric(c) || c == '.' || c == '-') && !is_final_url_char(cur->p, cur->end))
{
count++;
cur->p++;
Expand Down

0 comments on commit a841b44

Please sign in to comment.