From bf66d5bb76759f9e804031a4cb449acecfa50751 Mon Sep 17 00:00:00 2001 From: Jonas 'Sortie' Termansen Date: Mon, 22 Apr 2013 10:06:44 +0200 Subject: [PATCH] Add mbrlen(3). --- libc/Makefile | 3 +- libc/include/wchar.h | 2 +- libc/mbrlen.cpp | 85 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 libc/mbrlen.cpp diff --git a/libc/Makefile b/libc/Makefile index d17eef32..1267515a 100644 --- a/libc/Makefile +++ b/libc/Makefile @@ -78,6 +78,7 @@ ldiv.o \ lldiv.o \ localtime.o \ localtime_r.o \ +mbrlen.o \ mbrtowc.o \ mbsrtowcs.o \ mbstowcs.o \ @@ -94,8 +95,8 @@ setbuf.o \ setvbuf.o \ sigaddset.o \ sigdelset.o \ -sigfillset.o \ sigemptyset.o \ +sigfillset.o \ sigismember.o \ sort.o \ sprint.o \ diff --git a/libc/include/wchar.h b/libc/include/wchar.h index 7c86d0c6..e36b6bf3 100644 --- a/libc/include/wchar.h +++ b/libc/include/wchar.h @@ -63,6 +63,7 @@ struct tm; size_t mbsrtowcs(wchar_t* __restrict, const char** __restrict, size_t, mbstate_t* __restrict); size_t wcrtomb(char* __restrict, wchar_t, mbstate_t* __restrict); +size_t mbrlen(const char* __restrict, size_t, mbstate_t* __restrict); size_t mbrtowc(wchar_t* __restrict, const char* __restrict, size_t, mbstate_t* __restrict); wchar_t* wcscat(wchar_t* __restrict, const wchar_t* __restrict); wchar_t* wcschr(const wchar_t*, wchar_t); @@ -107,7 +108,6 @@ int wscanf(const wchar_t* __restrict, ...); long double wcstold(const wchar_t* __restrict, wchar_t** __restrict); long long wcstoll(const wchar_t* __restrict, wchar_t** __restrict, int); long wcstol(const wchar_t* __restrict, wchar_t** __restrict, int); -size_t mbrlen(const char* __restrict, size_t, mbstate_t* __restrict); size_t wcsftime(wchar_t* __restrict, size_t, const wchar_t* __restrict, const struct tm* __restrict); size_t wcsxfrm(wchar_t* __restrict, const wchar_t* __restrict, size_t); unsigned long long wcstoull(const wchar_t* __restrict, wchar_t** __restrict, int); diff --git a/libc/mbrlen.cpp 
/* Patch header carried over verbatim from the collapsed diff:
   b/libc/mbrlen.cpp new file mode 100644 index 00000000..aa4601ac
   --- /dev/null +++ b/libc/mbrlen.cpp @@ -0,0 +1,85 @@ */
/*******************************************************************************

    Copyright(C) Jonas 'Sortie' Termansen 2013.

    This file is part of the Sortix C Library.

    The Sortix C Library is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or (at your
    option) any later version.

    The Sortix C Library is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
    License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.

    mbrlen.cpp
    Determine number of bytes in next multibyte character.

*******************************************************************************/

#include <errno.h>
#include <string.h>
#include <wchar.h>

// Classify a byte as the first byte of a UTF-8 sequence.
//
// Returns the total number of bytes in the sequence that `uc` introduces
// (1 through 6), 0 if `uc` is a continuation byte of the form 10xxxxxx, or
// (size_t) -1 if `uc` matches none of the recognized patterns (0xFE and 0xFF).
//
// NOTE(review): lengths 5 and 6 are the pre-RFC 3629 forms and overlong lead
// bytes are not rejected here; this is kept as-is so the accepted byte set
// does not change.
static size_t utf8_header_length(unsigned char uc)
{
	if ( (uc & 0xC0) == 0x80 ) // 10xxxxxx
		return 0;
	if ( (uc & 0x80) == 0x00 ) // 0xxxxxxx
		return 1;
	if ( (uc & 0xE0) == 0xC0 ) // 110xxxxx
		return 2;
	if ( (uc & 0xF0) == 0xE0 ) // 1110xxxx
		return 3;
	if ( (uc & 0xF8) == 0xF0 ) // 11110xxx
		return 4;
	if ( (uc & 0xFC) == 0xF8 ) // 111110xx
		return 5;
	if ( (uc & 0xFE) == 0xFC ) // 1111110x
		return 6;
	return (size_t) -1;
}

// Determine how many bytes the next UTF-8 character in `s` occupies, looking
// at no more than `n` bytes.
//
// Returns:
//   0              if the next character is the null character; the state
//                  object is then reset to all zero bytes.
//   1 .. 6         the length of a complete, well-formed sequence.
//   (size_t) -2    if the examined bytes are a valid but incomplete prefix of
//                  a sequence (this includes n == 0).
//   (size_t) -1    with errno set to EILSEQ if the first byte is not a valid
//                  lead byte or a later byte is not of the form 10xxxxxx.
//
// If `ps` is NULL, a state object private to this function is used. If `s` is
// NULL, the call behaves as if the empty string had been passed with n == 1.
//
// TODO: Use the shift state. Partial sequences are not recorded in `ps`, so a
//       call that returned (size_t) -2 must be repeated from the lead byte.
#ifdef __cplusplus
extern "C"
#endif
size_t mbrlen(const char* __restrict s, size_t n, mbstate_t* __restrict ps)
{
	// Callers are allowed to pass no state object at all.
	static mbstate_t internal_state;
	if ( !ps )
		ps = &internal_state;

	// A null string means "examine the empty string".
	if ( !s )
	{
		s = "";
		n = 1;
	}

	if ( n == 0 )
		return (size_t) -2;

	unsigned char lead = (unsigned char) s[0];
	if ( !lead )
	{
		memset(ps, 0, sizeof(*ps));
		return 0;
	}

	size_t expected_length = utf8_header_length(lead);

	// Reject bytes that cannot start a sequence, including an unexpected byte
	// claiming to be in the middle of a UTF-8 multibyte sequence (10xxxxxx).
	// TODO: Should we play catch up with the partial sequence?
	if ( expected_length == (size_t) -1 || expected_length == 0 )
		return errno = EILSEQ, (size_t) -1;

	// Every byte after the lead byte must be of the form 10xxxxxx. Each one
	// that is available is checked, regardless of how n compares with the
	// expected length, so a malformed sequence is never reported as complete.
	for ( size_t i = 1; i < expected_length; i++ )
	{
		if ( n <= i )
			return (size_t) -2;
		if ( ((unsigned char) s[i] & 0xC0) != 0x80 )
			return errno = EILSEQ, (size_t) -1;
	}

	return expected_length;
}