Add regex(3).

This commit is contained in:
Jonas 'Sortie' Termansen 2015-12-15 23:43:34 +01:00
parent 5a3e181975
commit 75808c637d
7 changed files with 1338 additions and 6 deletions

View File

@ -62,6 +62,10 @@ malloc/heap_init.o \
malloc/__heap_lock.o \
malloc/__heap_unlock.o \
malloc/__heap_verify.o \
regex/regcomp.o \
regex/regerror.o \
regex/regexec.o \
regex/regfree.o \
signal/sigaddset.o \
signal/sigandset.o \
signal/sigdelset.o \

177
libc/include/regex.h Normal file
View File

@ -0,0 +1,177 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
regex.h
Regular expressions.
*******************************************************************************/
#ifndef _REGEX_H
#define _REGEX_H
#include <sys/cdefs.h>
#include <sys/__/types.h>
#if defined(__is_sortix_libc)
#include <pthread.h>
#else
#include <__/pthread.h>
#endif
#ifndef __size_t_defined
#define __size_t_defined
#define __need_size_t
#include <stddef.h>
#endif
/* Signed offset into the matched string; regexec() stores -1 in offsets that
   were not set by a match. */
typedef __ssize_t regoff_t;
/* Location of a match or of a subexpression match within the string given to
   regexec(): rm_so is the offset of its first character and rm_eo is the
   offset just past its last character. */
typedef struct
{
regoff_t rm_so;
regoff_t rm_eo;
} regmatch_t;
#if defined(__is_sortix_libc)
/* Kinds of nodes in the compiled regular expression (libc-internal). */
enum re_type
{
RE_TYPE_BOL, /* The '^' anchor. */
RE_TYPE_EOL, /* The '$' anchor. */
RE_TYPE_CHAR, /* One literal character (struct re_char). */
RE_TYPE_ANY_CHAR, /* The '.' wildcard. */
RE_TYPE_SET, /* A bracket expression (struct re_set). */
RE_TYPE_SUBEXPRESSION, /* Start of a parenthesized subexpression. */
RE_TYPE_SUBEXPRESSION_END, /* End marker following a subexpression. */
RE_TYPE_ALTERNATIVE, /* The '|' operator (struct re_split). */
RE_TYPE_OPTIONAL, /* Generated by regcomp() when expanding a repetition. */
RE_TYPE_LOOP, /* Generated by regcomp() when expanding an unbounded repetition. */
RE_TYPE_REPETITION, /* Parsed repetition; expanded away before matching. */
/* TODO: Back-references. */
};
struct re;
/* Payload of a RE_TYPE_CHAR node: the character that must be matched. */
struct re_char
{
char c;
};
/* Payload of a RE_TYPE_SET node: a bitmap with one bit per unsigned char
   value, where value uc is a member if bit (uc % 8) of set[uc / 8] is set. */
struct re_set
{
unsigned char set[32];
};
/* Payload of RE_TYPE_SUBEXPRESSION and RE_TYPE_SUBEXPRESSION_END nodes. */
struct re_subexpression
{
/* First node of the subexpression's contents; released by regfree(). */
struct re* re_owner;
/* Index of the regmatch_t slot that records this subexpression. */
size_t index;
};
/* Payload of RE_TYPE_ALTERNATIVE, RE_TYPE_OPTIONAL and RE_TYPE_LOOP nodes. */
struct re_split
{
/* Second node to try during matching, besides re_next; assigned by regcomp()
   when it links the control flow. */
struct re* re;
/* First node of the chain contained in this node; released by regfree(). */
struct re* re_owner;
};
/* Payload of a RE_TYPE_REPETITION node as produced by the parser. */
struct re_repetition
{
/* The node being repeated; released by regfree(). */
struct re* re;
/* Minimum number of repetitions. */
size_t min;
/* Maximum number of repetitions; the parser uses SIZE_MAX for no limit. */
size_t max;
};
/* One node of a compiled regular expression (libc-internal). */
struct re
{
/* Selects which member of the union below is meaningful. */
enum re_type re_type;
union
{
struct re_char re_char;
struct re_set re_set;
struct re_subexpression re_subexpression;
struct re_split re_split;
struct re_repetition re_repetition;
};
/* Node regexec() continues with after this one; assigned by regcomp() and
   also used as a temporary link by regfree(). */
struct re* re_next;
/* Next node in the chain this node belongs to; released by regfree(). */
struct re* re_next_owner;
/* Links of regexec()'s list of states active at the current position;
   regcomp() borrows these as temporary storage while compiling. */
struct re* re_current_state_prev;
struct re* re_current_state_next;
/* Link of regexec()'s list of states for the next position; regcomp()
   borrows this as temporary storage while compiling. */
struct re* re_upcoming_state_next;
/* Flags maintained by regexec(): whether this state has been processed at
   the current position, and whether it is on the current/upcoming list. */
unsigned char re_is_currently_done;
unsigned char re_is_current;
unsigned char re_is_upcoming;
/* This node's portion of the regex_t re_matches array. */
regmatch_t* re_matches;
};
#endif
/* A compiled regular expression, initialized by regcomp() and released by
   regfree(). Only re_nsub is visible under its real name outside libc. */
typedef struct
{
/* Number of parenthesized subexpressions; regcomp() leaves this zero when
   REG_NOSUB is given. */
size_t re_nsub;
#if defined(__is_sortix_libc)
/* Held by regexec() for the duration of a match. */
pthread_mutex_t re_lock;
/* First node of the compiled expression. */
struct re* re;
/* Storage for the per-node match arrays; released by regfree(). */
regmatch_t* re_matches;
size_t re_state_count;
/* The cflags given to regcomp(). */
int re_cflags;
#else
/* Private members; NOTE(review): presumably meant to mirror the layout of
   the libc-internal branch above - confirm when changing either branch. */
__pthread_mutex_t __re_lock;
void* __re;
regmatch_t* __re_matches;
size_t __re_state_count;
int __re_cflags;
#endif
} regex_t;
/* Flags for the cflags parameter of regcomp(). */
#define REG_EXTENDED (1 << 0)
#define REG_ICASE (1 << 1)
#define REG_NOSUB (1 << 2)
#define REG_NEWLINE (1 << 3)
/* Flags for the eflags parameter of regexec(). */
#define REG_NOTBOL (1 << 0)
#define REG_NOTEOL (1 << 1)
/* Error codes returned by regcomp() and regexec(); regerror() translates
   them into messages. */
#define REG_NOMATCH 1
#define REG_BADPAT 2
#define REG_ECOLLATE 3
#define REG_ECTYPE 4
#define REG_EESCAPE 5
#define REG_ESUBREG 6
#define REG_EBRACK 7
#define REG_EPAREN 8
#define REG_EBRACE 9
#define REG_BADBR 10
#define REG_ERANGE 11
#define REG_ESPACE 12
#define REG_BADRPT 13
#ifdef __cplusplus
extern "C" {
#endif
/* Compiles a pattern into the regex_t; returns 0 or a REG_* error code. */
int regcomp(regex_t* __restrict, const char* __restrict, int);
/* Copies the message for an error code into the buffer (if its size is
   non-zero) and returns the buffer size needed for the whole message. */
size_t regerror(int, const regex_t* __restrict, char* __restrict, size_t);
/* Matches a string against a compiled expression; returns 0 on a match and
   REG_NOMATCH otherwise. */
int regexec(const regex_t* __restrict, const char* __restrict, size_t,
regmatch_t* __restrict, int);
/* Releases the resources held by a compiled expression. */
void regfree(regex_t*);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif

727
libc/regex/regcomp.cpp Normal file
View File

@ -0,0 +1,727 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
regex/regcomp.cpp
Regular expression compiler.
*******************************************************************************/
#include <assert.h>
#include <errno.h>
#include <inttypes.h>
#include <limits.h>
#include <regex.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
// Parser bookkeeping for one subexpression that has been opened but not yet
// closed. The entries form a stack with the innermost subexpression first.
struct re_parse_subexpr
{
// The enclosing open subexpression, if any.
struct re_parse_subexpr* next;
// Saved link through which the subexpression node was stored; the parser
// resumes from it when the subexpression is closed.
struct re** prev_next_ptr;
// Saved link to the start of the alternative that contains the subexpression.
struct re** primary_next_ptr;
};
// State shared between regcomp() and the parser.
struct re_parse
{
// Stack of the subexpressions that are currently open.
struct re_parse_subexpr* subexpr;
// Index assigned to the next subexpression; regcomp() starts it at 1.
size_t subexpr_num;
};
// Tells whether a backslash followed by c is an escape sequence the parser
// accepts in a basic regular expression: any of the special characters
// (including the operators only reachable through a backslash) or a digit.
static inline bool re_basic_well_defined_escape(char c)
{
	switch ( c )
	{
	case '\\': case '(': case ')': case '{': case '}':
	case '.': case '*': case '[': case ']': case '^':
	case '$': case '+': case '?': case '|':
		return true;
	default:
		return '0' <= c && c <= '9';
	}
}
// Tells whether a backslash followed by c is an escape sequence the parser
// accepts in an extended regular expression: only the special characters.
static inline bool re_extended_well_defined_escape(char c)
{
	switch ( c )
	{
	case '\\': case '(': case ')': case '{': case '}':
	case '.': case '*': case '[': case ']': case '^':
	case '$': case '+': case '?': case '|':
		return true;
	default:
		return false;
	}
}
static inline void re_free(struct re* re)
{
regex_t regex;
memset(&regex, 0, sizeof(regex));
pthread_mutex_init(&regex.re_lock, NULL);
regex.re = re;
regfree(&regex);
}
// Parses a pattern into a chain of struct re nodes.
//
// parse:         Parser state. The stack of open subexpressions is pushed and
//                popped here; entries still on it when an error is returned
//                are left for the caller to release.
// prev_next_ptr: Where the pointer to the first node is stored.
// pattern:       The nul-terminated pattern.
// cflags:        The regcomp() flags; only REG_EXTENDED is examined here.
//
// Returns 0 on success and otherwise a REG_* error code. Nodes are linked
// into the chain as they are created (or freed here if they are not), so on
// error the caller releases the partial result with regfree().
//
// Repetition operators are only recorded as RE_TYPE_REPETITION nodes wrapping
// their operand; expanding them is left to re_transform().
static inline int re_parse(struct re_parse* parse,
                           struct re** restrict prev_next_ptr,
                           const char* restrict pattern,
                           int cflags)
{
	*prev_next_ptr = NULL;
	bool is_extended = cflags & REG_EXTENDED;
	bool is_basic = !is_extended;
	// Start of the current alternative, where a '|' node is inserted.
	struct re** primary_next_ptr = prev_next_ptr;
	struct re* re;
	size_t pattern_index = 0;
	//size_t alternative_begun_at = pattern_index;
	while ( true )
	{
		size_t c_pattern_index = pattern_index++;
		char c = pattern[c_pattern_index];
		if ( c == '\0' )
		{
			if ( parse->subexpr )
				return REG_EPAREN;
			return 0;
		}
		bool escaped = false;
		if ( c == '\\' )
		{
			c_pattern_index = pattern_index++;
			c = pattern[c_pattern_index];
			if ( c == '\0' )
				return REG_BADPAT;
			if ( is_basic && !re_basic_well_defined_escape(c) )
				return REG_BADPAT;
			if ( is_extended && !re_extended_well_defined_escape(c) )
				return REG_BADPAT;
			escaped = true;
		}
		// Whether c is written the way the grouping, interval and alternation
		// operators are written in the current mode: with a backslash in
		// basic mode and without one in extended mode.
		bool escaped_for_basic = (is_basic && escaped) ||
		                         (is_extended && !escaped);
		if ( escaped_for_basic && c == ')' )
		{
			struct re_parse_subexpr* subexpr = parse->subexpr;
			if ( !subexpr )
				return REG_EPAREN;
			*prev_next_ptr = NULL;
			prev_next_ptr = subexpr->prev_next_ptr;
			primary_next_ptr = subexpr->primary_next_ptr;
			//alternative_begun_at = subexpr->alternative_begun_at;
			parse->subexpr = subexpr->next;
			free(subexpr);
			// Continue with the subexpression node itself, so a repetition
			// operator following the ')' applies to the whole subexpression.
			re = *prev_next_ptr;
			goto subexpression_done;
		}
		// TODO: Properly reject anchors in the basic regular expression cases
		//       where they aren't appropriate. Mind that we implement the
		//       extension where all ERE features are available in BRE mode if
		//       accessed through backslashes.
		//if ( !escaped && c == '^' &&
		//     (0 < parse->subexpr_depth || c_pattern_index != alternative_begun_at) )
		//	return REG_BADRPT;
		//if ( !escaped && c == '$' &&
		//     (0 < parse->subexpr_depth || pattern[pattern_index] != '0') )
		//	return REG_BADRPT;
		// A repetition operator at this point has nothing to repeat.
		if ( !escaped && c == '*' )
			return REG_BADRPT;
		if ( escaped_for_basic && c == '{' )
			return REG_BADBR;
		if ( (is_basic && escaped && c == '+') ||
		     (is_extended && !escaped && c == '+') )
			return REG_BADRPT;
		if ( (is_basic && escaped && c == '?') ||
		     (is_extended && !escaped && c == '?') )
			return REG_BADRPT;
		if ( !(re = (struct re*) calloc(1, sizeof(struct re))) )
			return REG_ESPACE;
		if ( escaped_for_basic && c == '|' )
		{
			// What was parsed so far in this alternative becomes the chain
			// following the node and what follows becomes its contents.
			re->re_type = RE_TYPE_ALTERNATIVE;
			re->re_next_owner = *primary_next_ptr;
			re->re_split.re_owner = NULL;
			*primary_next_ptr = re;
			prev_next_ptr = primary_next_ptr = &re->re_split.re_owner;
			continue;
		}
		// TODO: Check if this anchor logic is the right one. This uses them as
		//       special characters in BRE mode in cases they shouldn't be.
		else if ( !escaped && c == '^' )
		{
			re->re_type = RE_TYPE_BOL;
			*prev_next_ptr = re;
			prev_next_ptr = &re->re_next_owner;
			continue;
		}
		else if ( !escaped && c == '$' )
		{
			re->re_type = RE_TYPE_EOL;
			*prev_next_ptr = re;
			prev_next_ptr = &re->re_next_owner;
			continue;
		}
		else if ( escaped_for_basic && c == '(' )
		{
			re->re_type = RE_TYPE_SUBEXPRESSION;
			re->re_subexpression.index = parse->subexpr_num++;
			re->re_subexpression.re_owner = NULL;
			*prev_next_ptr = re;
			struct re* end = (struct re*) calloc(1, sizeof(struct re));
			if ( !end )
				return REG_ESPACE;
			end->re_type = RE_TYPE_SUBEXPRESSION_END;
			end->re_subexpression.index = re->re_subexpression.index;
			re->re_next_owner = end;
			struct re_parse_subexpr* subexpr = (struct re_parse_subexpr*)
				calloc(sizeof(struct re_parse_subexpr), 1);
			if ( !subexpr )
				return REG_ESPACE;
			subexpr->prev_next_ptr = prev_next_ptr;
			subexpr->primary_next_ptr = primary_next_ptr;
			//subexpr->alternative_begun_at = alternative_begun_at;
			subexpr->next = parse->subexpr;
			parse->subexpr = subexpr;
			prev_next_ptr = &re->re_subexpression.re_owner;
			primary_next_ptr = &re->re_subexpression.re_owner;
			//alternative_begun_at = pattern_index;
			continue;
		}
		// TODO: This is not properly implemented.
		// TODO: This is not properly unicode-aware.
		// An escaped '[' is an ordinary character and is handled further below.
		else if ( !escaped && c == '[' )
		{
			re->re_type = RE_TYPE_SET;
			bool negate = false;
			if ( pattern[pattern_index] == '^' )
			{
				pattern_index += 1;
				negate = true;
			}
			// A ']' directly after the opening '[' or '[^' is an ordinary
			// member of the set rather than its terminator.
			bool first = true;
			while ( first || pattern[pattern_index] != ']' )
			{
				if ( pattern[pattern_index] == '\0' )
					return free(re), REG_EBRACK;
				first = false;
				unsigned char c_from = (unsigned char) pattern[pattern_index];
				unsigned char c_to = c_from;
				pattern_index += 1;
				// A '-' makes a range, except as the last character before
				// the closing ']' where it is an ordinary member.
				if ( pattern[pattern_index] == '-' &&
				     pattern[pattern_index + 1] != ']' )
				{
					// Never step past the terminating nul of the pattern.
					if ( pattern[pattern_index + 1] == '\0' )
						return free(re), REG_EBRACK;
					c_to = (unsigned char) pattern[pattern_index + 1];
					pattern_index += 2;
				}
				for ( unsigned int uc = c_from; uc <= c_to; uc++ )
				{
					size_t byte_index = uc / 8;
					size_t bit_index = uc % 8;
					re->re_set.set[byte_index] |= (1 << bit_index);
				}
			}
			if ( negate )
			{
				for ( size_t i = 0; i < 32; i++ )
					re->re_set.set[i] = ~re->re_set.set[i];
			}
			if ( pattern[pattern_index++] != ']' )
				return free(re), REG_EBRACK;
		}
		else if ( escaped && ('0' <= c && c <= '9') )
		{
			// TODO: This isn't implemented yet (not part of ERE).
			return free(re), REG_BADPAT;
		}
		else if ( !escaped && c == '.' )
			re->re_type = RE_TYPE_ANY_CHAR;
		else
		{
			re->re_type = RE_TYPE_CHAR;
			re->re_char.c = c;
		}
		*prev_next_ptr = re;
	subexpression_done:
		// Wrap the operand in a repetition node if an operator follows it.
		if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
		                  pattern[pattern_index + 1] == '{') ||
		     (is_extended && pattern[pattern_index] == '{' ) )
		{
			pattern_index += is_extended ? 1 : 2;
			if ( pattern[pattern_index] < '0' ||
			     pattern[pattern_index] > '9' )
				return REG_BADBR;
			uintmax_t repeat_min;
			uintmax_t repeat_max;
			const char* value;
			const char* value_end;
			// Parsing the counts must not change errno for the caller.
			int saved_errno = errno;
			value = (char*) (pattern + pattern_index);
			repeat_min = strtoumax((char*) value, (char**) &value_end, 10);
			int parse_errno = errno;
			errno = saved_errno;
			if ( parse_errno == ERANGE || SIZE_MAX < repeat_min )
				return REG_BADBR;
			pattern_index += value_end - value;
			if ( pattern[pattern_index] == ',' )
			{
				// No upper bound unless a second count follows the comma.
				repeat_max = SIZE_MAX;
				pattern_index += 1;
				if ( pattern[pattern_index] >= '0' &&
				     pattern[pattern_index] <= '9' )
				{
					saved_errno = errno;
					value = (char*) (pattern + pattern_index);
					repeat_max = strtoumax((char*) value, (char**) &value_end, 10);
					parse_errno = errno;
					errno = saved_errno;
					if ( parse_errno == ERANGE || SIZE_MAX < repeat_max )
						return REG_BADBR;
					if ( repeat_max < repeat_min )
						return REG_BADBR;
					pattern_index += value_end - value;
				}
			}
			else
			{
				repeat_max = repeat_min;
			}
			if ( (is_basic && pattern[pattern_index++] != '\\') ||
			     pattern[pattern_index++] != '}' )
				return REG_BADBR;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = (size_t) repeat_min;
			re_repetition->re_repetition.max = (size_t) repeat_max;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		else if ( pattern[pattern_index] == '*' )
		{
			pattern_index += 1;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = 0;
			re_repetition->re_repetition.max = SIZE_MAX;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		else if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
		                       pattern[pattern_index + 1] == '?') ||
		          (is_extended && pattern[pattern_index] == '?' ) )
		{
			pattern_index += is_extended ? 1 : 2;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = 0;
			re_repetition->re_repetition.max = 1;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		else if ( (is_basic && pattern[pattern_index + 0] == '\\' &&
		                       pattern[pattern_index + 1] == '+') ||
		          (is_extended && pattern[pattern_index] == '+' ) )
		{
			pattern_index += is_extended ? 1 : 2;
			struct re* re_repetition = (struct re*) calloc(1, sizeof(struct re));
			if ( !re_repetition )
				return REG_ESPACE;
			re_repetition->re_type = RE_TYPE_REPETITION;
			re_repetition->re_repetition.re = re;
			re_repetition->re_repetition.min = 1;
			re_repetition->re_repetition.max = SIZE_MAX;
			*prev_next_ptr = re_repetition;
			re = re_repetition;
		}
		// What follows a subexpression is linked after its end marker.
		if ( re->re_type == RE_TYPE_SUBEXPRESSION )
			re = re->re_next_owner; // RE_TYPE_SUBEXPRESSION_END.
		prev_next_ptr = &re->re_next_owner;
	}
}
static inline bool re_duplicate(struct re* templ, struct re** re_ptr)
{
struct re* copy;
struct re* parent_templ = NULL;
struct re* parent_copy = NULL;
while ( true )
{
if ( !templ )
{
if ( parent_templ )
{
templ = parent_templ;
copy = parent_copy;
parent_templ = templ->re_upcoming_state_next;
parent_copy = copy->re_upcoming_state_next;
templ = templ->re_next_owner;
re_ptr = &copy->re_next_owner;
continue;
}
return *re_ptr = NULL, true;
}
if ( !(copy = (struct re*) calloc(1, sizeof(struct re))) )
return false;
*re_ptr = copy;
copy->re_type = templ->re_type;
if ( templ->re_type == RE_TYPE_BOL )
;
else if ( templ->re_type == RE_TYPE_BOL )
;
else if ( templ->re_type == RE_TYPE_CHAR )
copy->re_char.c = templ->re_char.c;
else if ( templ->re_type == RE_TYPE_ANY_CHAR )
;
else if ( templ->re_type == RE_TYPE_SET )
memcpy(copy->re_set.set, templ->re_set.set, 32);
else if ( templ->re_type == RE_TYPE_SUBEXPRESSION )
{
copy->re_subexpression.index = templ->re_subexpression.index;
templ->re_upcoming_state_next = parent_templ;
copy->re_upcoming_state_next = parent_copy;
parent_templ = templ;
parent_copy = copy;
templ = templ->re_subexpression.re_owner;
re_ptr = &copy->re_subexpression.re_owner;
continue;
}
else if ( templ->re_type == RE_TYPE_SUBEXPRESSION_END )
copy->re_subexpression.index = templ->re_subexpression.index;
else if ( templ->re_type == RE_TYPE_ALTERNATIVE ||
templ->re_type == RE_TYPE_OPTIONAL ||
templ->re_type == RE_TYPE_LOOP )
{
templ->re_upcoming_state_next = parent_templ;
copy->re_upcoming_state_next = parent_copy;
parent_templ = templ;
parent_copy = copy;
templ = templ->re_split.re_owner;
re_ptr = &copy->re_split.re_owner;
continue;
}
else if ( templ->re_type == RE_TYPE_REPETITION )
{
copy->re_repetition.min = templ->re_repetition.min;
copy->re_repetition.max = templ->re_repetition.max;
templ->re_upcoming_state_next = parent_templ;
copy->re_upcoming_state_next = parent_copy;
parent_templ = templ;
parent_copy = copy;
templ = templ->re_split.re;
re_ptr = &copy->re_split.re;
continue;
}
else
assert(false);
templ = templ->re_next_owner;
re_ptr = &copy->re_next_owner;
}
}
static inline bool re_repetition(struct re* templ,
struct re** re_ptr,
size_t min,
size_t max,
struct re* after)
{
while ( true )
{
if ( !max )
return *re_ptr = after, true;
struct re* copy = (struct re*) calloc(1, sizeof(struct re));
if ( !copy )
return false;
*re_ptr = copy;
copy->re_type = templ->re_type;
if ( templ->re_type == RE_TYPE_BOL )
;
else if ( templ->re_type == RE_TYPE_BOL )
;
else if ( templ->re_type == RE_TYPE_CHAR )
copy->re_char.c = templ->re_char.c;
else if ( templ->re_type == RE_TYPE_ANY_CHAR )
;
else if ( templ->re_type == RE_TYPE_SET )
memcpy(copy->re_set.set, templ->re_set.set, 32);
else if ( templ->re_type == RE_TYPE_SUBEXPRESSION )
{
copy->re_subexpression.index = templ->re_subexpression.index;
if ( !re_duplicate(templ->re_subexpression.re_owner,
&copy->re_subexpression.re_owner) )
return false;
struct re* templ_end = templ->re_next_owner;
assert(templ_end && templ_end->re_type == RE_TYPE_SUBEXPRESSION_END);
struct re* end = (struct re*) calloc(1, sizeof(struct re));
if ( !end )
return false;
end->re_type = RE_TYPE_SUBEXPRESSION_END;
end->re_subexpression.index = templ_end->re_subexpression.index;
copy->re_next_owner = end;
}
else
assert(false);
if ( 1 <= min )
{
while ( copy->re_next_owner )
copy = copy->re_next_owner;
re_ptr = &copy->re_next_owner;
if ( max != SIZE_MAX )
max--;
min--;
}
else if ( max < SIZE_MAX )
{
struct re* wrap = (struct re*) calloc(1, sizeof(struct re));
if ( !wrap )
return false;
wrap->re_type = RE_TYPE_OPTIONAL;
wrap->re_split.re_owner = copy;
*re_ptr = wrap;
re_ptr = &wrap->re_next_owner;
max--;
}
else
{
struct re* wrap = (struct re*) calloc(1, sizeof(struct re));
if ( !wrap )
return false;
wrap->re_type = RE_TYPE_LOOP;
wrap->re_split.re_owner = copy;
*re_ptr = wrap;
re_ptr = &wrap->re_next_owner;
max = 0;
}
}
}
// Rewrites the parsed expression in place so it contains only node types
// regexec() handles: every RE_TYPE_REPETITION node is replaced by its
// expansion (see re_repetition()) and an empty expression is replaced by a
// single RE_TYPE_BOL node. The number of nodes in the result is added to
// *state_count_ptr.
//
// The tree is walked iteratively. While the contents of a node are visited,
// its re_current_state_prev member is borrowed to hold the link to resume
// from afterwards.
//
// Returns true on success and false if an allocation failed, in which case
// the expression is left in a state regfree() can release.
static inline bool re_transform(struct re** re_ptr, size_t* state_count_ptr)
{
	if ( !*re_ptr )
	{
		struct re* re;
		if ( !(re = (struct re*) calloc(1, sizeof(struct re))) )
			return false;
		re->re_type = RE_TYPE_BOL;
		*re_ptr = re;
	}
	struct re** parent_ptr = NULL;
	while ( *re_ptr )
	{
		struct re* re = *re_ptr;
		if ( re->re_type == RE_TYPE_REPETITION )
		{
			struct re* templ = re->re_repetition.re;
			size_t min = re->re_repetition.min;
			size_t max = re->re_repetition.max;
			struct re* after = re->re_next_owner;
			struct re* replacement = NULL;
			// Detach what follows so freeing the repetition node below
			// doesn't free it too.
			re->re_next_owner = NULL;
			if ( !re_repetition(templ, &replacement, min, max, after) )
			{
				// The partial expansion doesn't include the following
				// chain. Discard the expansion and the repetition node and
				// keep the following chain linked so the caller frees it.
				re_free(replacement);
				re_free(re);
				*re_ptr = after;
				return false;
			}
			re_free(re);
			// Visit the expansion next as it may contain repetitions itself.
			*re_ptr = re = replacement;
			continue;
		}
		(*state_count_ptr)++;
		if ( re->re_type == RE_TYPE_SUBEXPRESSION &&
		     re->re_subexpression.re_owner )
		{
			re->re_current_state_prev = (struct re*) parent_ptr;
			parent_ptr = re_ptr;
			re_ptr = &re->re_subexpression.re_owner;
			continue;
		}
		if ( (re->re_type == RE_TYPE_ALTERNATIVE ||
		      re->re_type == RE_TYPE_OPTIONAL ||
		      re->re_type == RE_TYPE_LOOP) && re->re_split.re_owner )
		{
			re->re_current_state_prev = (struct re*) parent_ptr;
			parent_ptr = re_ptr;
			re_ptr = &re->re_split.re_owner;
			continue;
		}
		re_ptr = &re->re_next_owner;
		// At the end of a chain, resume after the node that contains it.
		while ( !*re_ptr && parent_ptr )
		{
			re_ptr = parent_ptr;
			parent_ptr = (struct re**) (*re_ptr)->re_current_state_prev;
			re_ptr = &(*re_ptr)->re_next_owner;
		}
	}
	return true;
}
// Walks the transformed expression and prepares each node for matching:
// every visited node is numbered (counted in *state_count_ptr), is given the
// part of the matches array that begins matches_per_state entries times its
// number into the array, and has its re_next (and for splitting nodes
// re_split.re) assigned to the node(s) matching proceeds to. The owning
// re_next_owner and re_owner links are only read.
//
// The walk is iterative. parent_link is the node to continue with when the
// chain being walked ends, and parent is the innermost node whose following
// chain is still to be walked. While a node is on that stack, its
// re_current_state_prev, re_current_state_next and re_upcoming_state_next
// members are borrowed to hold the previous parent, the previous parent_link
// and the node to resume at.
static inline void re_control_flow(struct re* re,
regmatch_t* matches,
size_t matches_per_state,
size_t* state_count_ptr)
{
struct re* parent = NULL;
struct re* parent_link = NULL;
while ( re )
{
size_t re_index = (*state_count_ptr)++;
size_t offset = re_index * matches_per_state;
re->re_matches = matches + offset;
if ( re->re_type == RE_TYPE_ALTERNATIVE )
{
// An absent branch continues directly with whatever follows the
// enclosing construct.
if ( !re->re_split.re_owner )
re->re_split.re = parent_link;
if ( !re->re_next_owner )
re->re_next = parent_link;
if ( re->re_split.re_owner && re->re_next_owner )
{
// Both branches exist: walk the contained chain now and remember to
// walk the following chain afterwards.
re->re_next = re->re_next_owner;
re->re_current_state_prev = parent;
re->re_current_state_next = parent_link;
re->re_upcoming_state_next = re->re_next_owner;
parent = re;
re = re->re_split.re = re->re_split.re_owner;
}
else if ( re->re_split.re_owner )
re = re->re_split.re = re->re_split.re_owner;
else if ( re->re_next_owner )
re = re->re_next = re->re_next_owner;
else if ( parent )
{
// Neither branch exists: resume at the node remembered by the parent.
re = parent;
parent = re->re_current_state_prev;
parent_link = re->re_current_state_next;
re = re->re_upcoming_state_next;
}
else
re = NULL;
continue;
}
// The last node of a chain continues with the enclosing construct's link.
if ( !re->re_next_owner && parent_link )
re->re_next = parent_link;
else
re->re_next = re->re_next_owner;
if ( re->re_type == RE_TYPE_LOOP || re->re_type == RE_TYPE_OPTIONAL )
{
// Matching may either enter the contained chain (re_next) or skip to what
// follows (re_split.re).
struct re* inner = re->re_split.re_owner;
struct re* after = re->re_next;
re->re_split.re = after;
re->re_next = inner;
if ( re->re_next_owner )
{
re->re_current_state_prev = parent;
re->re_current_state_next = parent_link;
re->re_upcoming_state_next = after;
parent = re;
}
// The end of a loop's contents leads back to the loop node, while the end
// of an optional's contents leads to what follows it.
if ( re->re_type == RE_TYPE_LOOP )
parent_link = re;
else
parent_link = after;
re = inner;
continue;
}
if ( re->re_type == RE_TYPE_SUBEXPRESSION )
{
if ( re->re_subexpression.re_owner )
{
// Matching enters the contents, whose end leads to the node that
// follows the subexpression node.
re->re_current_state_prev = parent;
re->re_current_state_next = parent_link;
re->re_upcoming_state_next = re->re_next_owner;
parent = re;
parent_link = re->re_next;
re->re_next = re->re_subexpression.re_owner;
re = re->re_subexpression.re_owner;
continue;
}
}
// At the end of a chain, return to the innermost remembered node and
// proceed with the chain that follows it.
if ( !re->re_next_owner && parent )
{
re = parent;
parent = re->re_current_state_prev;
parent_link = re->re_current_state_next;
}
re = re->re_next_owner;
}
}
// Compiles pattern into regex according to cflags.
//
// The pattern is parsed into a tree of nodes, repetitions are expanded, one
// array of match slots per node is allocated, and finally the nodes are
// linked up for matching.
//
// Returns 0 on success and otherwise a REG_* error code, in which case
// everything allocated for regex has been released.
extern "C"
int regcomp(regex_t* restrict regex,
            const char* restrict pattern,
            int cflags)
{
	// TODO: Verify cflags.
	// TODO: Implement REG_ICASE.
	// TODO: Implement REG_NOSUB.
	// TODO: Implement REG_NEWLINE.
	memset(regex, 0, sizeof(*regex));
	pthread_mutex_init(&regex->re_lock, NULL);
	regex->re_cflags = cflags;

	struct re_parse parse;
	memset(&parse, 0, sizeof(parse));
	// Slot 0 is the whole match, so subexpressions are numbered from 1.
	parse.subexpr_num = 1;
	int ret = re_parse(&parse, &regex->re, pattern, cflags);

	// Subexpressions still open when parsing stopped are released here.
	struct re_parse_subexpr* open_subexpr = parse.subexpr;
	while ( open_subexpr )
	{
		struct re_parse_subexpr* enclosing = open_subexpr->next;
		free(open_subexpr);
		open_subexpr = enclosing;
	}
	parse.subexpr = NULL;

	if ( ret != 0 )
	{
		regfree(regex);
		return ret;
	}

	size_t state_count = 0;
	size_t matches_length;
	if ( !re_transform(&regex->re, &state_count) ||
	     __builtin_mul_overflow(parse.subexpr_num, state_count,
	                            &matches_length) ||
	     !(regex->re_matches = (regmatch_t*)
	           reallocarray(NULL, matches_length, sizeof(regmatch_t))) )
	{
		regfree(regex);
		return REG_ESPACE;
	}

	size_t state_recount = 0;
	re_control_flow(regex->re, regex->re_matches, parse.subexpr_num,
	                &state_recount);
	assert(state_count == state_recount);

	if ( !(cflags & REG_NOSUB) )
		regex->re_nsub = parse.subexpr_num - 1;
	return 0;
}

56
libc/regex/regerror.cpp Normal file
View File

@ -0,0 +1,56 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
regex/regerror.cpp
Regular expression error reporting.
*******************************************************************************/
#include <regex.h>
#include <stdio.h>
#include <string.h>
extern "C"
size_t regerror(int errnum,
const regex_t* restrict regex,
char* restrict errbuf,
size_t errbuf_size)
{
(void) regex;
const char* msg = "Unknown regular expression error";
switch ( errnum )
{
case REG_NOMATCH: msg = "Regular expression does not match"; break;
case REG_BADPAT: msg = "Invalid regular expression"; break;
case REG_ECOLLATE: msg = "Invalid collating element referenced"; break;
case REG_ECTYPE: msg = "Invalid character class type referenced"; break;
case REG_EESCAPE: msg = "Trailing <backslash> character in pattern"; break;
case REG_ESUBREG: msg = "Number in \\digit invalid or in error"; break;
case REG_EBRACK: msg = "\"[]\" imbalance"; break;
case REG_EPAREN: msg = "\"\\(\\)\" or \"()\" imbalance"; break;
case REG_EBRACE: msg = "\"\\{\\}\" imbalance"; break;
case REG_BADBR: msg = "Content of \"\\{\\}\" invalid: not a number, number too large, more than two numbers, first larger than second"; break;
case REG_ERANGE: msg = "Invalid endpoint in range expression"; break;
case REG_ESPACE: msg = "Out of memory"; break;
case REG_BADRPT: msg = "'?', '*', or '+' not preceded by valid regular expression"; break;
}
if ( errbuf_size )
strlcpy(errbuf, msg, errbuf_size);
return strlen(msg) + 1;
}

253
libc/regex/regexec.cpp Normal file
View File

@ -0,0 +1,253 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
regex/regexec.cpp
Regular expression execution.
*******************************************************************************/
#include <assert.h>
#include <regex.h>
#include <pthread.h>
// Helper for regexec(): makes new_state a state to be processed at the
// current position, directly after the state being processed.
//
// It relies on these variables of regexec() being in scope: state,
// current_states, current_states_last, match and nmatch.
//
// - If new_state is NULL, the end of the expression was reached: match is set
//   and every state queued after the current one is dropped from the list.
// - If new_state was already processed at this position, nothing is done.
// - Otherwise new_state is unlinked from the list if it is waiting on it, is
//   inserted right after the current state, and receives a copy of the first
//   nmatch match slots of the current state.
//
// The argument is evaluated once and the expansion is a single statement.
#define QUEUE_CURRENT_STATE(new_state) \
do \
{ \
	struct re* re_queued_state = (new_state); \
	if ( !re_queued_state ) \
	{ \
		match = true; \
		for ( struct re* re = state->re_current_state_next; \
		      re; \
		      re = re->re_current_state_next ) \
			re->re_is_current = 0; \
		state->re_current_state_next = NULL; \
		current_states_last = state; \
	} \
	else if ( !(re_queued_state->re_is_current && \
	            re_queued_state->re_is_currently_done) ) \
	{ \
		if ( re_queued_state->re_is_current ) \
		{ \
			if ( re_queued_state->re_current_state_prev ) \
				re_queued_state->re_current_state_prev->re_current_state_next = \
					re_queued_state->re_current_state_next; \
			else \
				current_states = re_queued_state->re_current_state_next; \
			if ( re_queued_state->re_current_state_next ) \
				re_queued_state->re_current_state_next->re_current_state_prev = \
					re_queued_state->re_current_state_prev; \
			else \
				current_states_last = re_queued_state->re_current_state_prev; \
		} \
		re_queued_state->re_current_state_prev = state; \
		re_queued_state->re_current_state_next = state->re_current_state_next; \
		if ( state->re_current_state_next ) \
			state->re_current_state_next->re_current_state_prev = \
				re_queued_state; \
		else \
			current_states_last = re_queued_state; \
		state->re_current_state_next = re_queued_state; \
		re_queued_state->re_is_currently_done = 0; \
		re_queued_state->re_is_current = 1; \
		re_queued_state->re_is_upcoming = 0; \
		for ( size_t m = 0; m < nmatch; m++ ) \
			re_queued_state->re_matches[m] = state->re_matches[m]; \
	} \
} while ( 0 )
// Helper for regexec(): makes new_state a state to be processed at the next
// position, because the current state consumed the current character.
//
// It relies on these variables of regexec() being in scope: state,
// current_states_last, upcoming_states, upcoming_states_last, match,
// consumed_char and nmatch.
//
// - If new_state is NULL, the end of the expression was reached: match and
//   consumed_char are set and every state queued after the current one is
//   dropped from the list of current states.
// - If new_state is already queued for the next position, nothing is done.
// - Otherwise new_state is appended to the list of upcoming states and
//   receives a copy of the first nmatch match slots of the current state.
//
// The argument is evaluated once and the expansion is a single statement.
#define QUEUE_UPCOMING_STATE(new_state) \
do \
{ \
	struct re* re_queued_state = (new_state); \
	if ( !re_queued_state ) \
	{ \
		consumed_char = true; \
		match = true; \
		for ( struct re* re = state->re_current_state_next; \
		      re; \
		      re = re->re_current_state_next ) \
			re->re_is_current = 0; \
		state->re_current_state_next = NULL; \
		current_states_last = state; \
	} \
	else if ( !re_queued_state->re_is_upcoming ) \
	{ \
		if ( !upcoming_states ) \
			upcoming_states = re_queued_state; \
		if ( upcoming_states_last ) \
			upcoming_states_last->re_upcoming_state_next = re_queued_state; \
		upcoming_states_last = re_queued_state; \
		re_queued_state->re_upcoming_state_next = NULL; \
		re_queued_state->re_is_upcoming = 1; \
		for ( size_t m = 0; m < nmatch; m++ ) \
			re_queued_state->re_matches[m] = state->re_matches[m]; \
	} \
} while ( 0 )
// Matches string against the compiled expression regex_const.
//
// The string is scanned one position at a time while a list of the states
// active at the current position is processed. States that consume the
// current character queue their successor for the next position and the
// other states queue their successors at the current position. Each state
// carries its own copy of the match offsets found on the way to it.
//
// nmatch, pmatch: Up to nmatch entries of pmatch receive the offsets of the
//                 whole match (entry 0) and of the subexpressions. Entries
//                 that are not set by the match are -1. Nothing is stored if
//                 the expression was compiled with REG_NOSUB.
// eflags:         REG_NOTBOL and REG_NOTEOL are honored.
//
// Returns 0 if a match was found and REG_NOMATCH otherwise.
extern "C"
int regexec(const regex_t* restrict regex_const,
const char* restrict string,
size_t nmatch,
regmatch_t* restrict pmatch,
int eflags)
{
// TODO: Sanitize eflags.
// The matching state is kept inside the compiled expression itself, so the
// expression is modified and is locked for the duration of the match.
regex_t* regex = (regex_t*) regex_const;
pthread_mutex_lock(&regex->re_lock);
if ( regex->re_cflags & REG_NOSUB )
nmatch = 0;
for ( size_t i = 0; i < nmatch; i++ )
{
pmatch[i].rm_so = -1;
pmatch[i].rm_eo = -1;
}
// Entries beyond the whole match and the subexpressions stay -1.
if ( regex->re_nsub + 1 < nmatch )
nmatch = regex->re_nsub + 1;
int result = REG_NOMATCH;
struct re* current_states = NULL;
struct re* current_states_last = NULL;
struct re* upcoming_states = NULL;
struct re* upcoming_states_last = NULL;
regex->re->re_is_current = 0;
for ( size_t i = 0; true; i++ )
{
// As long as nothing has matched, a match may also begin at this position:
// append the first state to the current states with the whole match
// starting here.
if ( !regex->re->re_is_current && result == REG_NOMATCH )
{
if ( current_states_last )
current_states_last->re_current_state_next = regex->re;
else
current_states = regex->re;
regex->re->re_current_state_prev = current_states_last;
regex->re->re_current_state_next = NULL;
current_states_last = regex->re;
regex->re->re_is_currently_done = 0;
regex->re->re_is_current = 1;
regex->re->re_is_upcoming = 0;
for ( size_t m = 0; m < nmatch; m++ )
{
regex->re->re_matches[m].rm_so = m == 0 ? i : -1;
regex->re->re_matches[m].rm_eo = -1;
}
}
char c = string[i];
// Process the current states in order. The list may grow while it is being
// walked as states queue their successors at this position.
for ( struct re* state = current_states;
state;
state = state->re_current_state_next )
{
// Both are set by the QUEUE_* macros when the end of the expression is
// reached from this state.
bool match = false;
bool consumed_char = false;
if ( state->re_type == RE_TYPE_BOL )
{
if ( !(eflags & REG_NOTBOL) )
QUEUE_CURRENT_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_EOL )
{
if ( !(eflags & REG_NOTEOL) && c == '\0' )
QUEUE_CURRENT_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_CHAR )
{
if ( c != '\0' && state->re_char.c == c )
QUEUE_UPCOMING_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_ANY_CHAR )
{
if ( c != '\0' )
QUEUE_UPCOMING_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_SET )
{
unsigned char uc = c;
if ( c != '\0' && (state->re_set.set[uc / 8] & (1 << (uc % 8))) )
QUEUE_UPCOMING_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_SUBEXPRESSION )
{
size_t index = state->re_subexpression.index;
state->re_matches[index].rm_so = i;
QUEUE_CURRENT_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_SUBEXPRESSION_END )
{
size_t index = state->re_subexpression.index;
state->re_matches[index].rm_eo = i;
QUEUE_CURRENT_STATE(state->re_next);
}
else if ( state->re_type == RE_TYPE_ALTERNATIVE ||
state->re_type == RE_TYPE_OPTIONAL ||
state->re_type == RE_TYPE_LOOP )
{
QUEUE_CURRENT_STATE(state->re_split.re);
QUEUE_CURRENT_STATE(state->re_next);
}
state->re_is_currently_done = 1;
if ( match )
{
// The match ends after the current character if it was consumed and
// otherwise before it.
state->re_matches[0].rm_eo = i + consumed_char;
for ( size_t m = 0; m < nmatch; m++ )
pmatch[m] = state->re_matches[m];
result = 0;
// Any match will do if the caller doesn't want the offsets.
if ( nmatch == 0 )
break;
}
}
for ( struct re* re = current_states; re; re = re->re_current_state_next )
re->re_is_current = 0;
if ( nmatch == 0 && result == 0 )
{
for ( struct re* re = upcoming_states; re; re = re->re_upcoming_state_next )
re->re_is_upcoming = 0;
break;
}
// The states queued for the next position become the current states.
current_states = upcoming_states;
if ( current_states )
current_states->re_current_state_prev = NULL;
current_states_last = upcoming_states_last;
for ( struct re* re = current_states; re; re = re->re_current_state_next )
{
re->re_is_currently_done = 0;
re->re_is_current = 1;
re->re_is_upcoming = 0;
re->re_current_state_next = re->re_upcoming_state_next;
if ( re->re_current_state_next )
re->re_current_state_next->re_current_state_prev = re;
}
upcoming_states = NULL;
upcoming_states_last = NULL;
// Only the first position is the beginning of the line.
eflags |= REG_NOTBOL;
// Stop when a match was found and no state remains that could replace it.
if ( current_states == NULL && result == 0 )
break;
if ( c == '\0' )
break;
}
pthread_mutex_unlock(&regex->re_lock);
return result;
}

72
libc/regex/regfree.cpp Normal file
View File

@ -0,0 +1,72 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2014, 2015.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
regex/regfree.cpp
Regular expression freeing.
*******************************************************************************/
#include <regex.h>
#include <stdlib.h>
// Releases the nodes, the match storage and the lock of a compiled
// expression.
//
// The nodes are freed iteratively. When a node owns a chain of other nodes,
// that chain is detached and freed first, while the re_next member of the
// node is borrowed to remember the node to return to afterwards.
extern "C" void regfree(regex_t* regex)
{
	struct re* parent = NULL;
	struct re* node = regex->re;
	while ( node )
	{
		// Locate the member, if any, through which this node owns a chain.
		struct re** owned_child = NULL;
		switch ( node->re_type )
		{
		case RE_TYPE_SUBEXPRESSION:
			owned_child = &node->re_subexpression.re_owner;
			break;
		case RE_TYPE_ALTERNATIVE:
		case RE_TYPE_OPTIONAL:
		case RE_TYPE_LOOP:
			owned_child = &node->re_split.re_owner;
			break;
		case RE_TYPE_REPETITION:
			owned_child = &node->re_repetition.re;
			break;
		default:
			break;
		}
		if ( owned_child && *owned_child )
		{
			// Detach the chain, free it first, and revisit this node later.
			node->re_next = parent;
			parent = node;
			node = *owned_child;
			*owned_child = NULL;
			continue;
		}
		struct re* following = node->re_next_owner;
		free(node);
		node = following;
		// At the end of a chain, return to the node that owned it.
		if ( !node && parent )
		{
			node = parent;
			parent = node->re_next;
		}
	}
	free(regex->re_matches);
	pthread_mutex_destroy(&regex->re_lock);
}

View File

@ -24,6 +24,7 @@
#include <error.h>
#include <inttypes.h>
#include <locale.h>
#include <regex.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@ -43,6 +44,14 @@ char* strdup_or_die(const char* str)
return result;
}
// Duplicates at most the first n bytes of str as a newly allocated string.
// Exits with status 2 and an error message if the allocation fails, so the
// returned pointer is never NULL. The caller frees the result.
char* strndup_or_die(const char* str, size_t n)
{
	char* result = strndup(str, n);
	// Test the allocation rather than the input.
	if ( !result )
		error(2, errno, "strndup");
	return result;
}
char* print_intmax_or_die(intmax_t value)
{
char value_string[sizeof(intmax_t) * 3];
@ -282,16 +291,50 @@ char* evaluate_mod(const char* a, const char* b)
return evaluate_integer_function(a, b, integer_mod);
}
// Evaluates the match operator: matches the string a against the basic
// regular expression b, where only a match that begins at the start of a
// counts.
//
// Returns a newly allocated string holding the result:
// - On a match where the first subexpression of b took part, the part of a
//   matched by that subexpression.
// - On any other match, the length of the matched part of a.
// - On no match, the empty string if b has subexpressions and otherwise "0".
//
// Exits with status 2 and an error message if b cannot be compiled.
char* evaluate_match(const char* a, const char* b)
{
	regex_t regex;
	int status = regcomp(&regex, b, 0);
	if ( status != 0 )
	{
		// Use a heap buffer if the message doesn't fit the stack buffer. The
		// possibly truncated message is used if that allocation fails.
		char errbuf[256];
		const char* errmsg = errbuf;
		char* erralloc = NULL;
		size_t errbuf_needed;
		if ( sizeof(errbuf) < (errbuf_needed = regerror(status, &regex, errbuf,
		                                                sizeof(errbuf))) )
		{
			if ( (erralloc = (char*) malloc(errbuf_needed)) )
			{
				errmsg = erralloc;
				regerror(status, &regex, erralloc, errbuf_needed);
			}
		}
		error(2, 0, "compiling regular expression: %s", errmsg);
		free(erralloc);
	}
	char* result;
	regmatch_t rm[2];
	if ( regexec(&regex, a, 2, rm, 0) == 0 && rm[0].rm_so == 0 )
	{
		if ( 0 <= rm[1].rm_so )
			result = strndup_or_die(a + rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so);
		else
			result = print_intmax_or_die(rm[0].rm_eo);
	}
	else
	{
		if ( 0 < regex.re_nsub )
			result = strdup_or_die("");
		else
			result = strdup_or_die("0");
	}
	regfree(&regex);
	return result;
}
struct binary_operator