/* * Copyright (c) 2023 Jonas 'Sortie' Termansen. * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. * * glob/glob.c * Search for paths matching a pattern. */ #include #include #include #include #include #include #include #include #include #include #include #include static int strcoll_indirect(const void* a_ptr, const void* b_ptr) { const char* a = *(const char* const*) a_ptr; const char* b = *(const char* const*) b_ptr; return strcoll(a, b); } struct segment { char* prefix; size_t prefix_length; DIR* dir; bool trivial; bool leading_period; bool match_directory; bool done; union { struct { size_t start; size_t length; }; regex_t regex; }; }; int glob(const char* restrict pattern, int flags, int (*errfunc)(const char*, int), glob_t* restrict gl) { if ( !(flags & GLOB_DOOFFS) ) gl->gl_offs = 0; if ( !(flags & GLOB_APPEND) ) { gl->gl_pathv = NULL; gl->gl_pathc = 0; } if ( gl->gl_offs == SIZE_MAX ) return GLOB_NOSPACE; size_t initial_pathc = gl->gl_pathc; // Reserve room for at least one string and the trailing null to prevent // the possibly of late errors in the GLOB_NOCHECK case. size_t pathl; if ( __builtin_add_overflow(gl->gl_offs, gl->gl_pathc, &pathl) || __builtin_add_overflow(pathl, 2, &pathl) ) return GLOB_NOSPACE; char** new_pathv = reallocarray(gl->gl_pathv, pathl, sizeof(char*)); if ( !new_pathv ) return GLOB_NOSPACE; gl->gl_pathv = new_pathv; size_t paths_length = gl->gl_pathc + 1; // Parse the pattern into segments where trivial segments are fixed path // components that can be directly opened and non-trivial segments require // searching a directory for entries that match the pattern. struct segment* segments = NULL; size_t segments_count = 0; size_t segments_length = 0; int result = 0; for ( size_t offset = 0; pattern[offset] && !result; ) { // Combine multiple trivial path components into a trivial segment, but // each non-trivial path component must be its own segment. size_t segment_length = 0; bool is_trivial = true; for ( size_t i = 0; pattern[offset + i]; i++ ) { if ( pattern[offset + i] == '*' || pattern[offset + i] == '?' || pattern[offset + i] == '[' ) { if ( segment_length ) break; is_trivial = false; } if ( pattern[offset + i] == '/' || !pattern[offset + i + 1] ) { segment_length = i + 1; if ( !is_trivial ) break; } } // Grow the list of segments as needed. if ( segments_count == segments_length ) { size_t old_length = segments_length ? segments_length : 1; struct segment* new_segments = reallocarray(segments, old_length, 2 * sizeof(struct segment)); if ( !new_segments ) { result = GLOB_NOSPACE; break; } segments = new_segments; segments_length = 2 * old_length; } struct segment* segment = &segments[segments_count++]; segment->match_directory = pattern[offset + segment_length - 1] == '/'; // Trivial segments just contain the pattern indices to directly open. if ( (segment->trivial = is_trivial) ) { segment->start = offset; segment->length = segment_length; } // Non-trivial segments are translated to a regular expression that is // compiled right now so it can be efficiently reused during the search. else { // Match a leading period only if the pattern explicitly starts with // a period. POSIX requires that leading periods aren't matched by // the * and ? and [ operators, but also are not matched by negated // patterns like [^a]. It's unspecified whether [.] would match a // leading period. Although regular expressions can express such // patterns, it's difficult to translate, and it's much easier to // just special case the leading period like this. segment->leading_period = pattern[offset] == '.'; char* re = NULL; size_t re_size; FILE* fp = open_memstream(&re, &re_size); bool escaped = false; fputc('^', fp); // Translate the pattern to an extended regular expression. for ( size_t i = 0; fp && i < segment_length; i++ ) { unsigned char c = pattern[offset + i]; if ( !escaped && c == '*' ) fputs(".*", fp); else if ( !escaped && c == '?' ) fputs(".", fp); else if ( !escaped && c == '[' ) { // The whole character range is passed directly to regcomp // so the correct end just has to be found, taking the edge // cases into account. POSIX requires using ! instead of ^ // for character range negations. As an extension, ^ is // also just passed directly to regcomp and works. const char* expr = pattern + offset + i; size_t max = segment_length - i; size_t len = 1; if ( len < max && (expr[len] == '!' || expr[len] == '^') ) len++; if ( len < max && expr[len] == ']' ) len++; while ( len < max && expr[len] != ']' ) { if ( 2 <= max - len && expr[len] == '[' && (expr[len + 1] == '.' || expr[len + 1] == '=' || expr[len + 1] == ':' ) ) { char t = expr[len + 1]; len += 2; while ( 2 <= max - len && !(expr[len] == t && expr[len + 1] == ']') ) len++; len += max - len < 2 ? max - len : 2; } else len++; } if ( len < max && expr[len] == ']' ) { for ( size_t n = 0; n <= len; n++ ) { if ( n == 1 && expr[n] == '!' ) fputc('^', fp); else fputc((unsigned char) expr[n], fp); } i += len; } else fputs("\\[", fp); } else if ( !escaped && c == '\\' && !(flags & GLOB_NOESCAPE) ) escaped = true; else if ( c != '/' ) { if ( c == '\\' || c == '(' || c == ')' || c == '{' || c == '}' || c == '.' || c == '*' || c == '[' || c == ']' || c == '^' || c == '$' || c == '+' || c == '?' || c == '|' ) fputc('\\', fp); fputc(c, fp); escaped = false; } } fputc('$', fp); if ( !fp || ferror(fp) || fflush(fp) == EOF ) { if ( fp ) fclose(fp); free(re); result = GLOB_NOSPACE; segments_count--; break; } fclose(fp); // Compile and reuse the regular expression for this segment. int ret = regcomp(&segment->regex, re, REG_EXTENDED); free(re); if ( ret ) { result = GLOB_NOSPACE; segments_count--; break; } } offset += segment_length; } // Start the search with the first segment. if ( !result && segments_count ) { segments[0].prefix = NULL; segments[0].prefix_length = 0; segments[0].dir = NULL; segments[0].done = false; // If the first segment is non-trivial then the current working // directory needs to be opened and searched. if ( !segments[0].trivial && !(segments[0].dir = opendir(".")) ) { if ( errno == ENOMEM ) result = GLOB_NOSPACE; else if ( (errfunc && errfunc(".", errno)) || (flags & GLOB_ERR) ) result = GLOB_ABORTED; else segments[0].done = true; } } // Search the filesystem depth first for paths matching the pattern. The // segments array is used for the hierarchical state to avoid recursion. // Each active segment has a directory currently being searched and yields // paths to be explored by the subsequent segment. The last segment adds // paths to the output array if they match the pattern. The search is // complete when the outermost segment is done or has failed. size_t current_segment = 0; while ( segments_count && (current_segment || !(segments[0].done || result) )) { struct segment* segment = &segments[current_segment]; // Pop to the the parent segment if the directory has been searched or // if an error has happened and the search is aborting. if ( segment->done || result ) { free(segment->prefix); segment->prefix = NULL; if ( segment->dir ) closedir(segment->dir); current_segment--; continue; } char* name; size_t name_length; unsigned char type = DT_UNKNOWN; // A trivial segment yields only the singular path it can match. if ( segment->trivial ) { name = strndup(pattern + segment->start, segment->length); name_length = segment->length; segment->done = true; } // Search the directory for entries matching the pattern. else { errno = 0; struct dirent* entry = readdir(segment->dir); if ( !entry ) { const char* path = segment->prefix ? segment->prefix : "."; if ( errno == ENOMEM ) result = GLOB_NOSPACE; else if ( (errfunc && errfunc(path, errno)) || (flags & GLOB_ERR) ) result = GLOB_ABORTED; segment->done = true; continue; } // Skip known non-directories when a directory needs to be found. if ( (current_segment + 1 < segments_count || segment->match_directory) && entry->d_type != DT_UNKNOWN && entry->d_type != DT_DIR && entry->d_type != DT_LNK ) continue; if ( !strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..") ) continue; if ( entry->d_name[0] == '.' && !segment->leading_period ) continue; if ( regexec(&segment->regex, entry->d_name, 0, NULL, 0) ) continue; name = strdup(entry->d_name); name_length = strlen(entry->d_name); type = entry->d_type; } if ( !name ) { result = GLOB_NOSPACE; continue; } // Append the segment's prefix with the name but keep an extra byte for // a possible trailing slash and of course the terminating nul byte. size_t size = 0; if ( __builtin_add_overflow(segment->prefix_length, name_length, &size) || __builtin_add_overflow(size, 1 + 1, &size) ) { free(name); result = GLOB_NOSPACE; continue; } char* path = malloc(size); if ( !path ) { free(name); free(path); result = GLOB_NOSPACE; continue; } if ( segment->prefix_length ) memcpy(path, segment->prefix, segment->prefix_length); memcpy(path + segment->prefix_length, name, name_length); path[segment->prefix_length + name_length] = '\0'; int fd = segment->dir ? dirfd(segment->dir) : AT_FDCWD; // If this is not the last segment, push to the next segment to search // the directory just found. if ( current_segment + 1 < segments_count ) { struct segment* next_segment = &segments[current_segment + 1]; int mode = next_segment->trivial ? O_SEARCH : O_RDONLY; int subdirfd = openat(fd, name, mode | O_DIRECTORY | O_CLOEXEC); free(name); next_segment->dir = subdirfd < 0 ? NULL : fdopendir(subdirfd); if ( !next_segment->dir ) { if ( 0 <= subdirfd ) close(subdirfd); if ( errno != ENOENT && errno != ENOTDIR && ((errfunc && errfunc(path, errno)) || (flags & GLOB_ERR)) ) result = GLOB_ABORTED; free(path); continue; } next_segment->prefix = path; next_segment->prefix_length = size - 2; // Add a trailing slash to the searched directory entries. if ( !segment->trivial ) { next_segment->prefix[next_segment->prefix_length++] = '/'; next_segment->prefix[next_segment->prefix_length] = '\0'; } next_segment->done = false; current_segment++; continue; } // The last segment just needs to output paths if they exist. else { bool want_slash = (flags & GLOB_MARK) || segment->match_directory; bool exists = true, is_dir = false; // The path is known to already exist for non-trivial segments since // it was returned by readdir, but we may need to check if the path // is a directory if readdir didn't tell us already. if ( !segment->trivial && (!want_slash || (type != DT_UNKNOWN && type != DT_LNK)) ) is_dir = type == DT_DIR; // Just check if the path exists if we don't add slashes to dirs. else if ( !want_slash ) exists = !faccessat(fd, name, F_OK, AT_SYMLINK_NOFOLLOW); // Otherwise use the slower stat operation to obtain the inode type. else { struct stat st; exists = !fstatat(fd, name, &st, AT_SYMLINK_NOFOLLOW); if ( want_slash && S_ISLNK(st.st_mode) ) fstatat(fd, name, &st, 0); is_dir = S_ISDIR(st.st_mode); } free(name); if ( segment->match_directory && !is_dir ) { free(path); continue; } if ( want_slash && path[size - 3] != '/' ) path[size - 2] = '/', path[size - 1] = '\0'; if ( !exists ) { if ( errno != ENOENT && ((errfunc && errfunc(path, errno)) || (flags & GLOB_ERR)) ) result = GLOB_ABORTED; free(path); continue; } // Grow the output array as needed. if ( gl->gl_pathc == paths_length ) { size_t old_pathc = gl->gl_pathc ? gl->gl_pathc : 1; if ( __builtin_mul_overflow(2, old_pathc, &pathl) || __builtin_add_overflow(gl->gl_offs, pathl, &pathl) || __builtin_add_overflow(1, pathl, &pathl) || !(new_pathv = reallocarray(gl->gl_pathv, pathl, sizeof(char*))) ) { free(path); result = GLOB_NOSPACE; continue; } gl->gl_pathv = new_pathv; paths_length = old_pathc * 2; } gl->gl_pathv[gl->gl_offs + gl->gl_pathc++] = path; } } // Clean up the segments and free the compiled regular expressions. if ( segments_count && segments[0].dir ) closedir(segments[0].dir); for ( size_t i = 0; i < segments_count; i++ ) if ( !segments[i].trivial ) regfree(&segments[i].regex); free(segments); // Output the input pattern if nothing matched when GLOB_NOCHECK. if ( !result && gl->gl_pathc == initial_pathc ) { if ( (flags & GLOB_NOCHECK) ) { if ( (gl->gl_pathv[gl->gl_offs] = strdup(pattern)) ) gl->gl_pathc = 1; else result = GLOB_NOSPACE; } else result = GLOB_NOMATCH; } // Sort the new entries per LC_COLLATE per POSIX. if ( !(flags & GLOB_NOSORT) ) qsort(gl->gl_pathv + gl->gl_offs + initial_pathc, gl->gl_pathc - initial_pathc, sizeof(char*), strcoll_indirect); gl->gl_pathv[gl->gl_offs + gl->gl_pathc] = NULL; return result; }