sortix-mirror/utils/sort.c

457 lines
14 KiB
C

/*
* Copyright (c) 2014, 2015 Jonas 'Sortie' Termansen.
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*
* sort.c
* Sort, merge, or sequence check text files.
*/
#include <errno.h>
#include <error.h>
#include <locale.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// TODO: Implement all the features mandated by POSIX.
// TODO: Implement the useful GNU extensions.
// Reverse the direction of a three-way comparison result. Any negative input
// becomes 1, any positive input becomes -1, and zero stays zero.
int flip_comparison(int rel)
{
    if ( rel < 0 )
        return 1;
    if ( rel > 0 )
        return -1;
    return 0;
}
// Bridge between element pointers and a string comparison: a_ptr and b_ptr
// each point at a `const char*` slot, whose contents are loaded and passed on
// to compare. The result of compare is returned unchanged.
int indirect_compare(int (*compare)(const char*, const char*),
                     const void* a_ptr, const void* b_ptr)
{
    const char* const* lhs_slot = (const char* const*) a_ptr;
    const char* const* rhs_slot = (const char* const*) b_ptr;
    return compare(*lhs_slot, *rhs_slot);
}
// Order two lines according to the collation rules of the current locale.
int compare_line(const char* a, const char* b)
{
    int relation = strcoll(a, b);
    return relation;
}
// Element-pointer flavour of compare_line: both arguments point at
// `const char*` slots holding the lines to compare.
int indirect_compare_line(const void* a_ptr, const void* b_ptr)
{
    int (*comparator)(const char*, const char*) = compare_line;
    return indirect_compare(comparator, a_ptr, b_ptr);
}
// Same ordering as compare_line, but with the direction inverted.
int compare_line_reverse(const char* a, const char* b)
{
    int forward = compare_line(a, b);
    return flip_comparison(forward);
}
// Element-pointer flavour of compare_line_reverse: both arguments point at
// `const char*` slots holding the lines to compare.
int indirect_compare_line_reverse(const void* a_ptr, const void* b_ptr)
{
    int (*comparator)(const char*, const char*) = compare_line_reverse;
    return indirect_compare(comparator, a_ptr, b_ptr);
}
// Order two lines with strverscmp, so that digit runs embedded in the text
// are compared as version numbers.
int compare_version(const char* a, const char* b)
{
    int relation = strverscmp(a, b);
    return relation;
}
// Element-pointer flavour of compare_version: both arguments point at
// `const char*` slots holding the lines to compare.
int indirect_compare_version(const void* a_ptr, const void* b_ptr)
{
    int (*comparator)(const char*, const char*) = compare_version;
    return indirect_compare(comparator, a_ptr, b_ptr);
}
// Same ordering as compare_version, but with the direction inverted.
int compare_version_reverse(const char* a, const char* b)
{
    int forward = compare_version(a, b);
    return flip_comparison(forward);
}
// Element-pointer flavour of compare_version_reverse: both arguments point at
// `const char*` slots holding the lines to compare.
int indirect_compare_version_reverse(const void* a_ptr, const void* b_ptr)
{
    int (*comparator)(const char*, const char*) = compare_version_reverse;
    return indirect_compare(comparator, a_ptr, b_ptr);
}
// Reading state for the concatenation of all input files, consumed one line
// at a time by read_input_stream_line.
struct input_stream
{
// Paths of the input files; "-" selects standard input.
const char* const* files;
// Index into files of the file currently being read.
size_t files_current;
// Number of entries in files; zero means only standard input is read.
size_t files_length;
// Stream for files[files_current], or NULL when it is not open yet.
FILE* current_file;
// Path that the most recently returned line was read from.
const char* last_file_path;
// Line number of the most recently returned line; reset to zero when a new
// file from files is opened.
uintmax_t last_line_number;
// Set to false when opening or reading an input fails.
bool result_status;
};
// Read the next delim-terminated record from fp.
//
// Returns a newly allocated string that the caller must free, with the
// trailing delimiter (if present) removed, or NULL on end of file or on a
// read error. When the stream's error indicator is set, a diagnostic naming
// fpname is printed; callers can use ferror(fp) to tell an error from EOF.
char* read_line(FILE* fp, const char* fpname, int delim)
{
    char* line = NULL;
    size_t line_size = 0;
    ssize_t amount = getdelim(&line, &line_size, delim, fp);
    if ( amount < 0 )
    {
        // Report before releasing the buffer: free() is not guaranteed to
        // leave errno untouched on every C library, and the diagnostic must
        // describe the failed read.
        if ( ferror(fp) )
            error(0, errno, "read: `%s'", fpname);
        free(line);
        return NULL;
    }
    // Strip the delimiter; the final record of a file may lack one.
    if ( 0 < amount && (unsigned char) line[amount-1] == (unsigned char) delim )
        line[amount-1] = '\0';
    return line;
}
// Return the next line from the input stream, or NULL once every input has
// been exhausted. The returned string is allocated and owned by the caller.
//
// With no files configured, standard input is read. Otherwise the files are
// read in order ("-" meaning standard input); a file that cannot be opened is
// reported and skipped. Any open or read failure clears is->result_status.
// After a successful read, is->last_file_path and is->last_line_number
// describe where the returned line came from.
char* read_input_stream_line(struct input_stream* is, int delim)
{
    if ( !is->files_length )
    {
        char* result = read_line(stdin, "<stdin>", delim);
        if ( ferror(stdin) )
            is->result_status = false;
        is->last_file_path = "-";
        // Only count lines that were actually read, as the file path does.
        if ( result )
            is->last_line_number++;
        return result;
    }
    while ( is->files_current < is->files_length )
    {
        const char* path = is->files[is->files_current];
        if ( !is->current_file )
        {
            is->last_line_number = 0;
            if ( !strcmp(path, "-") )
                is->current_file = stdin;
            else if ( !(is->current_file = fopen(path, "r")) )
            {
                error(0, errno, "`%s'", path);
                is->result_status = false;
                is->files_current++;
                continue;
            }
        }
        char* result = read_line(is->current_file, path, delim);
        if ( !result )
        {
            // read_line has already printed a diagnostic for read errors, so
            // only record the failure here rather than reporting it twice
            // (errno may no longer describe the failed read by now).
            if ( ferror(is->current_file) )
                is->result_status = false;
            // Standard input is not ours to close.
            if ( is->current_file != stdin )
                fclose(is->current_file);
            is->current_file = NULL;
            is->files_current++;
            continue;
        }
        is->last_file_path = path;
        is->last_line_number++;
        return result;
    }
    return NULL;
}
// Read every remaining line of the input stream into a growing array.
//
// Returns the array of allocated lines (NULL when nothing was read) and
// stores the number of lines in *result_num_lines. If the array cannot be
// grown, the failure is reported, is->result_status is cleared, and the lines
// gathered so far are returned.
char** read_input_stream_lines(size_t* result_num_lines,
                               struct input_stream* is,
                               int delim)
{
    char** lines = NULL;
    size_t count = 0;
    size_t capacity = 0;
    for ( ; ; )
    {
        char* line = read_input_stream_line(is, delim);
        if ( !line )
            break;
        if ( count == capacity )
        {
            // Start with room for 64 lines and double from there.
            size_t grown_capacity = capacity ? 2 * capacity : 64;
            size_t grown_size = sizeof(char*) * grown_capacity;
            char** grown = (char**) realloc(lines, grown_size);
            if ( !grown )
            {
                error(0, errno, "realloc");
                free(line);
                is->result_status = false;
                break;
            }
            lines = grown;
            capacity = grown_capacity;
        }
        lines[count] = line;
        count++;
    }
    *result_num_lines = count;
    return lines;
}
// Remove the NULL entries from the first *argc elements of *argv, keeping
// the remaining entries in their original order, and update *argc to the
// number of entries kept.
//
// The slots vacated at the end of the counted range are set to NULL, so a
// vector that was NULL-terminated stays NULL-terminated. Nothing at or beyond
// the original (*argv)[*argc] is read or written, and the work is done in a
// single pass rather than by shifting the tail once per removed entry.
static void compact_arguments(int* argc, char*** argv)
{
    char** vector = *argv;
    int original_count = *argc;
    int kept = 0;
    for ( int i = 0; i < original_count; i++ )
    {
        if ( vector[i] )
            vector[kept++] = vector[i];
    }
    for ( int i = kept; i < original_count; i++ )
        vector[i] = NULL;
    *argc = kept;
}
// Write the usage summary to fp, using argv0 as the program name. Options
// that are not implemented yet are kept in disabled (#if 0) sections.
static void help(FILE* fp, const char* argv0)
{
    fprintf(fp, "Usage: %s [OPTION]... [FILE]...\n", argv0);
    fputs("Write sorted concatenation of all FILE(s) to standard output.\n", fp);
    fputs("\n", fp);
    fputs("Mandatory arguments to long options are mandatory for short options too.\n", fp);
    fputs("Ordering options:\n", fp);
    fputs("\n", fp);
#if 0
    fputs(" -b, --ignore-leading-blanks ignore leading blanks\n", fp);
    fputs(" -d, --dictionary-order consider only blanks and alphanumeric characters\n", fp);
    fputs(" -f, --ignore-case fold lower case to upper case characters\n", fp);
    fputs(" -g, --general-numeric-sort compare according to general numerical value\n", fp);
    fputs(" -i, --ignore-nonprinting consider only printable characters\n", fp);
    fputs(" -M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n", fp);
    fputs(" -h, --human-numeric-sort compare human readable numbers (e.g., 2K 1G)\n", fp);
    fputs(" -n, --numeric-sort compare according to string numerical value\n", fp);
    fputs(" -R, --random-sort sort by random hash of keys\n", fp);
    fputs(" --random-source=FILE get random bytes from FILE\n", fp);
#endif
    fputs(" -r, --reverse reverse the result of comparisons\n", fp);
#if 0
    fputs(" --sort=WORD sort according to WORD:\n", fp);
    fputs(" general-numeric -g, human-numeric -h, month -M,\n", fp);
    fputs(" numeric -n, random -R, version -V\n", fp);
#endif
    fputs(" -V, --version-sort natural sort of (version) numbers within text\n", fp);
    fputs("\n", fp);
    fputs("Other options:\n", fp);
    fputs("\n", fp);
#if 0
    fputs(" --batch-size=NMERGE merge at most NMERGE inputs at once;\n", fp);
    fputs(" for more use temp files\n", fp);
#endif
    fputs(" -c, --check, --check=diagnose-first check for sorted input; do not sort\n", fp);
    fputs(" -C, --check=quiet, --check=silent like -c, but do not report first bad line\n", fp);
#if 0
    fputs(" --compress-program=PROG compress temporaries with PROG;\n", fp);
    fputs(" decompress them with PROG -d\n", fp);
    fputs(" --debug annotate the part of the line used to sort,\n", fp);
    fputs(" and warn about questionable usage to stderr\n", fp);
    fputs(" --files0-from=F read input from the files specified by\n", fp);
    fputs(" NUL-terminated names in file F;\n", fp);
    fputs(" If F is - then read names from standard input\n", fp);
    fputs(" -k, --key=POS1[,POS2] start a key at POS1 (origin 1), end it at POS2\n", fp);
    fputs(" (default end of line). See POS syntax below\n", fp);
#endif
    fputs(" -m, --merge merge already sorted files; do not sort\n", fp);
    fputs(" -o, --output=FILE write result to FILE instead of standard output\n", fp);
#if 0
    fputs(" -s, --stable stabilize sort by disabling last-resort comparison\n", fp);
    fputs(" -S, --buffer-size=SIZE use SIZE for main memory buffer\n", fp);
    fputs(" -t, --field-separator=SEP use SEP instead of non-blank to blank transition\n", fp);
    fputs(" -T, --temporary-directory=DIR use DIR for temporaries, not $TMPDIR or /tmp;\n", fp);
    fputs(" multiple options specify multiple directories\n", fp);
    fputs(" --parallel=N change the number of sorts run concurrently to N\n", fp);
#endif
    fputs(" -u, --unique with -c, check for strict ordering;\n", fp);
    fputs(" without -c, output only the first of an equal run\n", fp);
    fputs(" -z, --zero-terminated end lines with 0 byte, not newline\n", fp);
    fputs(" --help display this help and exit\n", fp);
    fputs(" --version output version information and exit\n", fp);
    fputs("\n", fp);
#if 0
    fputs("POS is F[.C][OPTS], where F is the field number and C the character position\n", fp);
    fputs("in the field; both are origin 1. If neither -t nor -b is in effect, characters\n", fp);
    fputs("in a field are counted from the beginning of the preceding whitespace. OPTS is\n", fp);
    fputs("one or more single-letter ordering options, which override global ordering\n", fp);
    fputs("options for that key. If no key is given, use the entire line as the key.\n", fp);
    fputs("\n", fp);
    fputs("SIZE may be followed by the following multiplicative suffixes:\n", fp);
    fputs("% 1% of memory, b 1, K 1024 (default), and so on for M, G, T, P, E, Z, Y.\n", fp);
    fputs("\n", fp);
#endif
    fputs("With no FILE, or when FILE is -, read standard input.\n", fp);
    fputs("\n", fp);
    fputs("*** WARNING ***\n", fp);
    fputs("The locale specified by the environment affects sort order.\n", fp);
    fputs("Set LC_ALL=C to get the traditional sort order that uses\n", fp);
    fputs("native byte values.\n", fp);
}
// Write the program name followed by the build's version string to fp.
static void version(FILE* fp, const char* argv0)
{
    const char* release = VERSIONSTR;
    fprintf(fp, "%s (Sortix) %s\n", argv0, release);
}
// Entry point: parse the command line, then either verify that the input is
// ordered (-c / -C) or sort the concatenated input and write the result.
//
// Exit status:
//   0    success (or sorted input in check mode)
//   1    disorder found in check mode; also used for an unknown option
//   2    an input or output file could not be opened, read or written
//   125  an option that requires an argument was given without one
int main(int argc, char* argv[])
{
    setlocale(LC_ALL, "");

    bool check = false;
    bool check_quiet = false;
    bool merge = false;
    const char* output = NULL;
    bool reverse = false;
    bool unique = false;
    bool version_sort = false;
    bool zero_terminated = false;

    const char* argv0 = argv[0];
    // Consumed options are replaced by NULL and squeezed out afterwards, so
    // that only the file operands remain in argv.
    for ( int i = 1; i < argc; i++ )
    {
        const char* arg = argv[i];
        // Operands, including a lone "-", are left in place.
        if ( arg[0] != '-' || !arg[1] )
            continue;
        argv[i] = NULL;
        if ( !strcmp(arg, "--") )
            break;
        if ( arg[1] != '-' )
        {
            char c;
            while ( (c = *++arg) ) switch ( c )
            {
            case 'c': check = true; break;
            // -C is the quiet variant of -c, so it must enable checking.
            case 'C': check = check_quiet = true; break;
            case 'm': merge = true; break;
            case 'o':
                // The argument is either the rest of this word or the next
                // word on the command line.
                if ( !*(output = arg + 1) )
                {
                    if ( i + 1 == argc )
                    {
                        error(0, 0, "option requires an argument -- 'o'");
                        fprintf(stderr, "Try `%s --help' for more information.\n", argv0);
                        exit(125);
                    }
                    output = argv[i+1];
                    argv[++i] = NULL;
                }
                // Stop scanning this word: what followed -o was its argument.
                arg = "o";
                break;
            case 'r': reverse = true; break;
            case 'u': unique = true; break;
            case 'V': version_sort = true; break;
            case 'z': zero_terminated = true; break;
            default:
                fprintf(stderr, "%s: unknown option -- '%c'\n", argv0, c);
                help(stderr, argv0);
                exit(1);
            }
        }
        else if ( !strcmp(arg, "--help") )
            help(stdout, argv0), exit(0);
        else if ( !strcmp(arg, "--version") )
            version(stdout, argv0), exit(0);
        else if ( !strcmp(arg, "--check") ||
                  !strcmp(arg, "--check=diagnose-first") )
            check = true, check_quiet = false;
        else if ( !strcmp(arg, "--check=quiet") ||
                  !strcmp(arg, "--check=silent") )
            check = true, check_quiet = true;
        else if ( !strcmp(arg, "--merge") )
            merge = true;
        else if ( !strncmp(arg, "--output=", strlen("--output=")) )
            output = arg + strlen("--output=");
        else if ( !strcmp(arg, "--output") )
        {
            if ( i + 1 == argc )
            {
                error(0, 0, "option '--output' requires an argument");
                fprintf(stderr, "Try `%s --help' for more information.\n", argv0);
                exit(125);
            }
            output = argv[i+1];
            argv[++i] = NULL;
        }
        else if ( !strcmp(arg, "--reverse") )
            reverse = true;
        else if ( !strcmp(arg, "--unique") )
            unique = true;
        else if ( !strcmp(arg, "--version-sort") )
            version_sort = true;
        else if ( !strcmp(arg, "--zero-terminated") )
            zero_terminated = true;
        else
        {
            fprintf(stderr, "%s: unknown option: %s\n", argv0, arg);
            help(stderr, argv0);
            exit(1);
        }
    }

    compact_arguments(&argc, &argv);

    int delim = zero_terminated ? '\0' : '\n';

    // Pick the direct comparison (check mode, -u) and its qsort counterpart.
    int (*compare)(const char*, const char*);
    int (*qsort_compare)(const void*, const void*);
    if ( version_sort && reverse )
        compare = compare_version_reverse,
        qsort_compare = indirect_compare_version_reverse;
    else if ( version_sort )
        compare = compare_version,
        qsort_compare = indirect_compare_version;
    else if ( reverse )
        compare = compare_line_reverse,
        qsort_compare = indirect_compare_line_reverse;
    else
        compare = compare_line,
        qsort_compare = indirect_compare_line;

    struct input_stream is;
    memset(&is, 0, sizeof(is));
    is.files = (const char* const*) (argv + 1);
    is.files_current = 0;
    is.files_length = argc - 1;
    is.result_status = true;

    if ( check )
    {
        if ( output && !freopen(output, "w", stdout) )
            error(2, errno, "`%s'", output);

        // With -u each line must be strictly greater than its predecessor;
        // otherwise equal neighbours are acceptable.
        int needed_relation = unique ? 1 : 0;
        char* prev_line = NULL;
        char* line;
        while ( (line = read_input_stream_line(&is, delim)) )
        {
            if ( prev_line && compare(line, prev_line) < needed_relation )
            {
                // Disorder is not a system error: pass 0 rather than a stale
                // errno so no unrelated error text is appended.
                if ( !check_quiet )
                    error(0, 0, "%s:%ju: disorder: %s", is.last_file_path,
                          is.last_line_number, line);
                exit(1);
            }
            free(prev_line);
            prev_line = line;
        }
        free(prev_line);
    }
    else
    {
        (void) merge;
        size_t lines_used = 0;
        char** lines = read_input_stream_lines(&lines_used, &is, delim);

        // Open the output only after all input has been read: opening it
        // truncates the file, and the output may be one of the inputs.
        if ( output && !freopen(output, "w", stdout) )
            error(2, errno, "`%s'", output);

        // lines is NULL when there was no input; do not hand that to qsort.
        if ( lines_used )
            qsort(lines, lines_used, sizeof(*lines), qsort_compare);
        for ( size_t i = 0; i < lines_used; i++ )
        {
            if ( unique && i && compare(lines[i-1], lines[i]) == 0 )
                continue;
            fputs(lines[i], stdout);
            fputc(delim, stdout);
        }

        // Make sure buffered output actually reached its destination.
        if ( ferror(stdout) || fflush(stdout) == EOF )
            error(2, errno, "write: `%s'", output ? output : "<stdout>");
    }

    return is.result_status ? 0 : 2;
}