Reimplement wchar conversion API.

This commit is contained in:
Jonas 'Sortie' Termansen 2014-04-17 17:41:37 +02:00
parent 9e6148f6ff
commit f41964fcab
16 changed files with 463 additions and 303 deletions

View File

@ -238,6 +238,8 @@ time/strftime.o \
time/timegm.o \
wchar/mbrlen.o \
wchar/mbrtowc.o \
wchar/mbsinit.o \
wchar/mbsnrtowcs.o \
wchar/mbsrtowcs.o \
wchar/wcrtomb.o \
wchar/wcscat.o \
@ -251,6 +253,7 @@ wchar/wcslen.o \
wchar/wcsncat.o \
wchar/wcsncmp.o \
wchar/wcsncpy.o \
wchar/wcsnrtombs.o \
wchar/wcspbrk.o \
wchar/wcsrchr.o \
wchar/wcsrtombs.o \

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013.
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2013, 2014.
This file is part of the Sortix C Library.
@ -39,8 +39,7 @@ __BEGIN_DECLS
/* TODO: This random interface is stupid. What should a good value be? */
#define RAND_MAX 32767
/* TODO: This is just a value. It's not a compile time constant! */
#define MB_CUR_MAX 16
#define MB_CUR_MAX 6
typedef struct
{

View File

@ -105,12 +105,15 @@ typedef __wint_t wint_t;
/* Conversion state information. */
/* Conversion state for the multibyte <-> wide character functions.
   count  - continuation bytes still expected for the current character
            (zero means the initial conversion state).
   length - total byte length announced by the leading byte.
   wch    - the character value accumulated so far.
   The libc itself uses the plain field names; everyone else sees the same
   layout under reserved-namespace names. */
typedef struct
{
#if defined(__is_sortix_libc)
	unsigned short count;
	unsigned short length;
	wint_t wch;
#else
	unsigned short __count;
	unsigned short __length;
	wint_t __wch;
#endif
} mbstate_t;
#define __mbstate_t_defined 1
#endif
@ -126,12 +129,11 @@ struct tm;
/* TODO: wint_t getwchar(void); */
size_t mbrlen(const char* __restrict, size_t, mbstate_t* __restrict);
size_t mbrtowc(wchar_t* __restrict, const char* __restrict, size_t, mbstate_t* __restrict);
/* TODO: int mbsinit(const mbstate_t*); */
int mbsinit(const mbstate_t*);
size_t mbsrtowcs(wchar_t* __restrict, const char** __restrict, size_t, mbstate_t* __restrict);
/* TODO: wint_t putwc(wchar_t, FILE*); */
/* TODO: wint_t putwchar(wchar_t); */
/* TODO: wint_t ungetwc(wint_t, FILE*); */
size_t wcrtomb(char* __restrict, wchar_t, mbstate_t* __restrict);
wchar_t* wcscat(wchar_t* __restrict, const wchar_t* __restrict);
wchar_t* wcschr(const wchar_t*, wchar_t);
@ -193,7 +195,7 @@ int wcwidth(wchar_t);
/* Functions from POSIX 2008. */
#if __USE_SORTIX || 200809L <= __USE_POSIX
/* TODO: size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict); */
size_t mbsnrtowcs(wchar_t* __restrict, const char** __restrict, size_t, size_t, mbstate_t* __restrict);
/* TODO: FILE* open_wmemstream(wchar_t**, size_t*); */
/* TODO: wchar_t* wcpcpy(wchar_t* __restrict, const wchar_t* __restrict); */
/* TODO: wchar_t* wcpncpy(wchar_t* __restrict, const wchar_t* __restrict, size_t); */
@ -204,7 +206,7 @@ int wcwidth(wchar_t);
/* TODO: int wcsncasecmp(const wchar_t*, const wchar_t *, size_t); */
/* TODO: int wcsncasecmp_l(const wchar_t*, const wchar_t *, size_t, locale_t); */
/* TODO: size_t wcsnlen(const wchar_t*, size_t); */
/* TODO: size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict); */
size_t wcsnrtombs(char* __restrict, const wchar_t** __restrict, size_t, size_t, mbstate_t* __restrict);
/* TODO: size_t wcsxfrm_l(wchar_t* __restrict, const wchar_t* __restrict, size_t, locale_t); */
#endif

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013.
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
@ -26,18 +26,18 @@
#include <string.h>
#include <wchar.h>
// TODO: This function is unpure and should be removed.
// Determine the length in bytes of the multibyte character at s, examining
// at most n bytes.
//
// s - Multibyte character to measure, or NULL to reset the hidden conversion
//     state and ask whether the encoding is state-dependent.
// n - Maximum number of bytes of s to examine.
//
// Returns 0 if s is NULL or if s points at a null character, the number of
// bytes that make up the character on success, and -1 if mbrtowc reports an
// invalid or incomplete sequence.
extern "C" int mblen(const char* s, size_t n)
{
	static mbstate_t ps;

	// Answer the state query up front instead of letting mbrtowc run on a
	// null string; 0 is what the reset-and-report path has always returned.
	if ( !s )
	{
		memset(&ps, 0, sizeof(ps));
		return 0;
	}

	wchar_t wc;
	size_t result = mbrtowc(&wc, s, n, &ps);

	// Both failure modes are reported as -1 to the caller, so the hidden state
	// must not keep half a character around to corrupt the next call.
	if ( result == (size_t) -1 || result == (size_t) -2 )
	{
		memset(&ps, 0, sizeof(ps));
		return -1;
	}

	return (int) result;
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013.
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
@ -23,16 +23,14 @@
*******************************************************************************/
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// Convert the multibyte string src into at most n wide characters in dst.
//
// The conversion always starts from the initial conversion state: a fresh,
// zeroed mbstate_t is handed to mbsrtowcs, so no hidden static state is
// consulted or modified.
//
// Returns whatever mbsrtowcs returns for the conversion.
extern "C"
size_t mbstowcs(wchar_t* restrict dst, const char* restrict src, size_t n)
{
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));
	return mbsrtowcs(dst, (const char**) &src, n, &ps);
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012.
Copyright(C) Jonas 'Sortie' Termansen 2011, 2012, 2014.
This file is part of the Sortix C Library.
@ -24,10 +24,20 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// TODO: This function is unpure and should be removed.
// Decode the multibyte character at s (at most n bytes) into *pwc.
//
// pwc - Where to store the decoded wide character (forwarded to mbrtowc).
// s   - Multibyte character to decode, or NULL to reset the hidden conversion
//       state and ask whether the encoding is state-dependent.
// n   - Maximum number of bytes of s to examine.
//
// Returns 0 if s is NULL or if a null character was decoded, the number of
// bytes consumed on success, and -1 if mbrtowc reports an invalid or
// incomplete sequence.
extern "C" int mbtowc(wchar_t* pwc, const char* s, size_t n)
{
	static mbstate_t ps;

	// Handle the state query before converting so that nothing is stored
	// through pwc and a stale partial character cannot turn it into an error.
	if ( !s )
	{
		memset(&ps, 0, sizeof(ps));
		return 0;
	}

	size_t result = mbrtowc(pwc, s, n, &ps);

	// The caller only ever sees -1 for both failure modes, so drop any
	// partially decoded character rather than resuming it on the next call.
	if ( result == (size_t) -1 || result == (size_t) -2 )
	{
		memset(&ps, 0, sizeof(ps));
		return -1;
	}

	return (int) result;
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013.
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
@ -23,16 +23,13 @@
*******************************************************************************/
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// TODO: This function is unpure and should be removed.
// Convert the wide-character string src into at most n bytes in dst.
//
// The conversion always starts from the initial conversion state: a fresh,
// zeroed mbstate_t is handed to wcsrtombs, so no hidden static state is
// consulted or modified.
//
// Returns whatever wcsrtombs returns for the conversion.
extern "C" size_t wcstombs(char* dst, const wchar_t* src, size_t n)
{
	mbstate_t ps;
	memset(&ps, 0, sizeof(ps));
	return wcsrtombs(dst, &src, n, &ps);
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2012.
Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
This file is part of the Sortix C Library.
@ -23,10 +23,19 @@
*******************************************************************************/
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// TODO: This function is unpure and should be removed.
// Encode the wide character wc as a multibyte character stored at s.
//
// s  - Destination buffer, or NULL to reset the hidden conversion state and
//      ask whether the encoding is state-dependent.
// wc - Wide character to encode.
//
// Returns 0 if s is NULL, the number of bytes written on success, and -1 if
// wcrtomb rejects the character.
extern "C" int wctomb(char* s, wchar_t wc)
{
	static mbstate_t ps;

	// wcrtomb with a null buffer reports the size of an encoded null
	// character; passing that through would wrongly claim a state-dependent
	// encoding, so answer the query here.
	if ( !s )
	{
		memset(&ps, 0, sizeof(ps));
		return 0;
	}

	size_t result = wcrtomb(s, wc, &ps);
	if ( result == (size_t) -1 )
		return -1;

	return (int) result;
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013.
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
@ -22,64 +22,11 @@
*******************************************************************************/
#include <errno.h>
#include <string.h>
#include <wchar.h>
// Classify a byte by its run of leading one bits, UTF-8 style.
//
// Returns 0 for a continuation byte (10xxxxxx), 1 for a single-byte character
// (0xxxxxxx), 2 through 6 for a leading byte announcing that many bytes, and
// (size_t) -1 for the bytes 0xFE and 0xFF, which match none of those forms.
static size_t utf8_header_length(unsigned char uc)
{
	// Count how many consecutive high-order bits are set.
	size_t leading_ones = 0;
	for ( unsigned int mask = 0x80; mask != 0 && (uc & mask) != 0; mask >>= 1 )
		leading_ones++;

	switch ( leading_ones )
	{
	case 0: return 1;             // 0xxxxxxx
	case 1: return 0;             // 10xxxxxx
	case 2:                       // 110xxxxx
	case 3:                       // 1110xxxx
	case 4:                       // 11110xxx
	case 5:                       // 111110xx
	case 6: return leading_ones;  // 1111110x
	default: return (size_t) -1;  // 1111111x
	}
}
// TODO: Use the shift state.
// Determine the number of bytes in the multibyte character at s, examining at
// most n bytes and continuing from the conversion state in ps.
//
// This is mbrtowc with the decoded character discarded. If ps is NULL, a
// conversion state private to this function is used instead.
//
// Returns whatever mbrtowc returns for the same input.
extern "C"
size_t mbrlen(const char* restrict s, size_t n, mbstate_t* restrict ps)
{
	static mbstate_t static_ps;
	return mbrtowc(NULL, s, n, ps ? ps : &static_ps);
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2012.
Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
This file is part of the Sortix C Library.
@ -24,82 +24,123 @@
#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <wchar.h>
extern "C"
size_t mbrtowc(wchar_t* restrict pwc, const char* restrict s, size_t n,
mbstate_t* restrict /*ps*/)
static
size_t utf8_mbrtowc(wchar_t* restrict pwc,
const char* restrict s,
size_t n,
mbstate_t* restrict ps)
{
if ( !s )
size_t i;
for ( i = 0; !(i && ps->count == 0); i++ )
{
// TODO: Restore ps to initial state if currently valid.
return 0;
}
uint8_t* buf = (uint8_t*) s;
wchar_t ret = 0;
size_t numbytes = 0;
size_t sequence_len = 1;
while ( numbytes < sequence_len )
{
if ( numbytes == n )
{
// TODO: Support restore through the mbstate_t!
// Handle the case where we were not able to fully decode a character,
// but it is still possible to finish decoding given more bytes.
if ( n <= i )
return (size_t) -2;
char c = s[i];
unsigned char uc = (unsigned char) c;
// The initial state is that we expect a leading byte that informs us of
// the length of this character sequence. The number of consecutive high
// order bits tells us how many bytes make up this character (one
// leading byte followed by zero or more continuation bytes).
if ( ps->count == 0 )
{
if ( (uc & 0b10000000) == 0b00000000 ) /* 0xxxxxxx */
{
ps->length = (ps->count = 0) + 1;
ps->wch = (wchar_t) uc & 0b1111111;
}
else if ( (uc & 0b11100000) == 0b11000000 ) /* 110xxxxx */
{
ps->length = (ps->count = 1) + 1;
ps->wch = (wchar_t) uc & 0b11111;
}
else if ( (uc & 0b11110000) == 0b11100000 ) /* 1110xxxx */
{
ps->length = (ps->count = 2) + 1;
ps->wch = (wchar_t) uc & 0b1111;
}
else if ( (uc & 0b11111000) == 0b11110000 ) /* 11110xxx */
{
ps->length = (ps->count = 3) + 1;
ps->wch = (wchar_t) uc & 0b111;
}
#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
else if ( (uc & 0b11111100) == 0b11111000 ) /* 111110xx */
{
ps->length = (ps->count = 4) + 1) + 1;
ps->wch = (wchar_t) uc & 0b11;
}
else if ( (uc & 0b11111110) == 0b11111100 ) /* 1111110x */
{
ps->length = (ps->count = 5) + 1) + 1;
ps->wch = (wchar_t) uc & 0b1;
}
#endif
else
return errno = EILSEQ, (size_t) -1;
}
uint8_t b = buf[numbytes++];
bool is_continuation = b >> (8-2) == 0b10;
if ( 1 == numbytes && is_continuation )
return errno = EILSEQ, (size_t) -1;
if ( 2 <= numbytes && !is_continuation )
return errno = EILSEQ, (size_t) -1;
wchar_t new_bits;
size_t new_bits_num;
if ( b >> (8-1) == 0b0 )
new_bits = b & 0b01111111,
new_bits_num = 7,
sequence_len = 1;
else if ( b >> (8-2) == 0b10 )
new_bits = b & 0b00111111,
new_bits_num = 6,
sequence_len = 2;
else if ( b >> (8-3) == 0b110 )
new_bits = b & 0b00011111,
new_bits_num = 5,
sequence_len = 3;
else if ( b >> (8-4) == 0b1110 )
new_bits = b & 0b00001111,
new_bits_num = 4,
sequence_len = 4;
else if ( b >> (8-5) == 0b11110 )
new_bits = b & 0b00000111,
new_bits_num = 3,
sequence_len = 5;
else if ( b >> (8-6) == 0b111110 )
new_bits = b & 0b00000011,
new_bits_num = 2,
sequence_len = 6;
else if ( b >> (8-7) == 0b1111110 )
new_bits = b & 0b00000001,
new_bits_num = 1,
sequence_len = 7;
// The secondary state is that following a leading byte, we are
// expecting a non-zero number of continuation byte bytes.
else
return errno = EILSEQ, (size_t) -1;
ret = ret >> new_bits_num | new_bits;
{
// Verify this is a continuation byte.
if ( (uc & 0b11000000) != 0b10000000 )
return errno = EILSEQ, (size_t) -1;
ps->wch = ps->wch << 6 | (uc & 0b00111111);
ps->count--;
}
}
if ( !ret )
{
// TODO: Reset ps to initial state.
return 0;
}
if ( (numbytes == 2 && ret <= 0x007F) ||
(numbytes == 3 && ret <= 0x07FF) ||
(numbytes == 4 && ret <= 0xFFFF) ||
(numbytes == 5 && ret <= 0x1FFFFF) ||
(numbytes == 6 && ret <= 0x3FFFFFF) )
// Reject the character if it was produced with an overly long sequence.
if ( ps->length == 1 && 1 << 7 <= ps->wch )
return errno = EILSEQ, (size_t) -1;
if ( ps->length == 2 && 1 << (5 + 1 * 6) <= ps->wch )
return errno = EILSEQ, (size_t) -1;
if ( ps->length == 3 && 1 << (4 + 2 * 6) <= ps->wch )
return errno = EILSEQ, (size_t) -1;
if ( ps->length == 4 && 1 << (3 + 3 * 6) <= ps->wch )
return errno = EILSEQ, (size_t) -1;
#if 0 /* 5-byte and 6-byte sequences are forbidden by RFC 3629 */
if ( ps->length == 5 && 1 << (2 + 4 * 6) <= ps->wch )
return errno = EILSEQ, (size_t) -1;
if ( ps->length == 6 && 1 << (1 + 5 * 6) <= ps->wch )
return errno = EILSEQ, (size_t) -1;
#endif
// RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF.
if ( 0x10FFFF <= ps->wch )
return errno = EILSEQ, (size_t) -1;
wchar_t result = ps->wch;
if ( pwc )
*pwc = ret;
return numbytes;
*pwc = result;
ps->length = 0;
ps->wch = 0;
return result != L'\0' ? i : 0;
}
// Decode the next multibyte character from the n bytes at s into *pwc,
// continuing from the conversion state in ps.
//
// pwc - If not NULL, receives the decoded wide character.
// s   - Bytes to decode. If NULL, the call behaves as mbrtowc(NULL, "", 1, ps)
//       and the values of pwc and n are ignored.
// n   - Number of bytes available at s.
// ps  - Conversion state, or NULL to use a state private to this function.
//
// Returns the result of utf8_mbrtowc for the selected arguments.
extern "C"
size_t mbrtowc(wchar_t* restrict pwc,
               const char* restrict s,
               size_t n,
               mbstate_t* restrict ps)
{
	static mbstate_t static_ps;
	if ( !ps )
		ps = &static_ps;

	// A null string means "decode the empty string": nothing may be stored
	// through the caller's pwc in that case.
	if ( !s )
		pwc = NULL, s = "", n = 1;

	// TODO: Verify whether the current locale is UTF-8.
	return utf8_mbrtowc(pwc, s, n, ps);
}

30
libc/wchar/mbsinit.cpp Normal file
View File

@ -0,0 +1,30 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2014.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
wchar/mbsinit.cpp
Determine conversion object status.
*******************************************************************************/
#include <wchar.h>
// Report whether ps describes the initial conversion state.
//
// Returns 1 if ps is NULL or if no continuation bytes are pending in it
// (its count field is zero), and 0 otherwise.
extern "C" int mbsinit(const mbstate_t* ps)
{
	if ( ps == NULL )
		return 1;
	return ps->count == 0 ? 1 : 0;
}

82
libc/wchar/mbsnrtowcs.cpp Normal file
View File

@ -0,0 +1,82 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
wchar/mbsnrtowcs.cpp
Convert a multibyte string to a wide-character string.
*******************************************************************************/
#include <assert.h>
#include <wchar.h>
// Convert a multibyte string to a wide-character string, reading at most
// src_len bytes and storing at most dst_len wide characters.
//
// dst     - Destination buffer, or NULL to only count the wide characters the
//           conversion would produce (dst_len is then ignored).
// src_ptr - Points to the source string pointer (neither may be NULL). It is
//           only updated when dst is not NULL: to NULL if the terminating
//           null character was converted, otherwise to the first byte that
//           was not converted.
// src_len - Maximum number of source bytes to examine.
// dst_len - Capacity of dst in wide characters.
// ps      - Conversion state, or NULL to use a state private to this function.
//
// Returns the number of wide characters produced, not counting a terminating
// null character, or (size_t) -1 if mbrtowc reported a decoding error.
extern "C"
size_t mbsnrtowcs(wchar_t* restrict dst,
                  const char** restrict src_ptr,
                  size_t src_len,
                  size_t dst_len,
                  mbstate_t* restrict ps)
{
	static mbstate_t static_ps;
	if ( !ps )
		ps = &static_ps;

	assert(src_ptr && *src_ptr);
	const char* src = *src_ptr;

	// Continue to decode wide characters until we have filled the destination
	// buffer or if we have exhausted the limit on input bytes.
	size_t dst_offset = 0;
	size_t src_offset = 0;
	while ( (!dst || dst_offset < dst_len) && src_offset < src_len )
	{
		mbstate_t ps_copy = *ps;
		wchar_t wc;
		size_t amount = mbrtowc(&wc, src + src_offset, src_len - src_offset, ps);

		// Stop in the event a decoding error occurred.
		if ( amount == (size_t) -1 )
		{
			if ( dst )
				*src_ptr = src + src_offset;
			return (size_t) -1;
		}

		// Stop decoding early in the event we encountered a partial character
		// and undo the partial progress, as its bytes are left unconsumed.
		if ( amount == (size_t) -2 )
		{
			*ps = ps_copy;
			break;
		}

		// Store the decoded wide character in the destination buffer.
		if ( dst )
			dst[dst_offset] = wc;

		// Stop decoding after decoding a null character and hand a NULL source
		// pointer to the caller, not including the null character in the
		// number of characters stored in the destination buffer.
		if ( wc == L'\0' )
		{
			if ( dst )
				*src_ptr = NULL;
			return dst_offset;
		}

		dst_offset++;
		src_offset += amount;
	}

	// The source pointer is left untouched when merely counting.
	if ( dst )
		*src_ptr = src + src_offset;
	return dst_offset;
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013.
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
@ -24,49 +24,16 @@
#include <assert.h>
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// Convert a null-terminated multibyte string to a wide-character string,
// storing at most dst_len wide characters.
//
// This is mbsnrtowcs without a limit on the number of source bytes. If ps is
// NULL, a conversion state private to this function is used so the hidden
// state is not shared with mbsnrtowcs.
//
// Returns whatever mbsnrtowcs returns for the conversion.
extern "C"
size_t mbsrtowcs(wchar_t* restrict dst,
                 const char** restrict src_ptr,
                 size_t dst_len,
                 mbstate_t* restrict ps)
{
	static mbstate_t static_ps;
	if ( !ps )
		ps = &static_ps;
	return mbsnrtowcs(dst, src_ptr, SIZE_MAX, dst_len, ps);
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2012.
Copyright(C) Jonas 'Sortie' Termansen 2012, 2014.
This file is part of the Sortix C Library.
@ -23,58 +23,87 @@
*******************************************************************************/
#include <errno.h>
#include <stdint.h>
#include <stdlib.h>
#include <wchar.h>
// Encode the wide character wc as UTF-8 into the buffer at s.
//
// s  - Destination buffer (must not be NULL) with room for at least 4 bytes.
// wc - Character to encode.
// The conversion state parameter is unused because UTF-8 encoding needs none.
//
// Returns the number of bytes written (1 through 4), or (size_t) -1 with errno
// set to EILSEQ if wc is a UTF-16 surrogate or lies outside 0x0 - 0x10FFFF.
static
size_t utf8_wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict /*ps*/)
{
	// Work on an unsigned copy so that negative values of a signed wchar_t
	// land above the valid range rather than being encoded as a single byte.
	uint32_t cp = (uint32_t) wc;

	// The definition of UTF-8 prohibits encoding character numbers between
	// U+D800 and U+DFFF, which are reserved for use with the UTF-16 encoding
	// form (as surrogate pairs) and do not directly represent characters.
	if ( 0xD800 <= cp && cp <= 0xDFFF )
		return errno = EILSEQ, (size_t) -1;

	// RFC 3629 limits UTF-8 to 0x0 through 0x10FFFF (inclusive).
	if ( 0x10FFFF < cp )
		return errno = EILSEQ, (size_t) -1;

	size_t index = 0;

	if ( cp < (1U << 7) ) /* 0xxxxxxx */
	{
		s[index++] = (char) (0b00000000 | (cp >> 0 & 0b01111111));
		return index;
	}

	if ( cp < (1U << (5 + 1 * 6)) ) /* 110xxxxx 10xxxxxx^1 */
	{
		s[index++] = (char) (0b11000000 | (cp >> 1*6 & 0b00011111));
		s[index++] = (char) (0b10000000 | (cp >> 0*6 & 0b00111111));
		return index;
	}

	if ( cp < (1U << (4 + 2 * 6)) ) /* 1110xxxx 10xxxxxx^2 */
	{
		s[index++] = (char) (0b11100000 | (cp >> 2*6 & 0b00001111));
		s[index++] = (char) (0b10000000 | (cp >> 1*6 & 0b00111111));
		s[index++] = (char) (0b10000000 | (cp >> 0*6 & 0b00111111));
		return index;
	}

	// Everything left is at most 0x10FFFF and fits the 4-byte form; the
	// 5-byte and 6-byte forms are forbidden by RFC 3629.
	/* 11110xxx 10xxxxxx^3 */
	s[index++] = (char) (0b11110000 | (cp >> 3*6 & 0b00000111));
	s[index++] = (char) (0b10000000 | (cp >> 2*6 & 0b00111111));
	s[index++] = (char) (0b10000000 | (cp >> 1*6 & 0b00111111));
	s[index++] = (char) (0b10000000 | (cp >> 0*6 & 0b00111111));
	return index;
}
// Encode the wide character wc as a multibyte character stored at s.
//
// If s is NULL, a null wide character is encoded into a scratch buffer
// instead and wc is ignored. The conversion state ps is passed along to
// utf8_wcrtomb unchanged.
//
// Returns the result of utf8_wcrtomb.
extern "C"
size_t wcrtomb(char* restrict s, wchar_t wc, mbstate_t* restrict ps)
{
	char scratch[MB_CUR_MAX];
	// TODO: Verify whether the current locale is UTF-8.
	if ( s )
		return utf8_wcrtomb(s, wc, ps);
	return utf8_wcrtomb(scratch, L'\0', ps);
}

87
libc/wchar/wcsnrtombs.cpp Normal file
View File

@ -0,0 +1,87 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
The Sortix C Library is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or (at your
option) any later version.
The Sortix C Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General Public License
along with the Sortix C Library. If not, see <http://www.gnu.org/licenses/>.
wchar/wcsnrtombs.cpp
Convert a wide-character string to multibyte string.
*******************************************************************************/
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
// Convert a wide-character string to a multibyte string, reading at most
// src_len wide characters and storing at most dst_len bytes.
//
// dst     - Destination buffer, or NULL to only count the bytes the conversion
//           would produce (dst_len is then ignored).
// src_ptr - Points to the source string pointer (neither may be NULL). It is
//           only updated when dst is not NULL: to NULL if the terminating
//           null character was converted, otherwise to the first wide
//           character that was not converted.
// src_len - Maximum number of wide characters to convert.
// dst_len - Capacity of dst in bytes.
// ps      - Conversion state, or NULL to use a state private to this function.
//
// Returns the number of bytes produced, not counting a terminating null byte,
// or (size_t) -1 if wcrtomb reported an encoding error.
extern "C"
size_t wcsnrtombs(char* restrict dst,
                  const wchar_t** restrict src_ptr,
                  size_t src_len,
                  size_t dst_len,
                  mbstate_t* restrict ps)
{
	static mbstate_t static_ps;
	if ( !ps )
		ps = &static_ps;

	assert(src_ptr && *src_ptr);
	const wchar_t* src = *src_ptr;

	// Continue to encode multibyte characters until we have filled the
	// destination buffer or if we have exhausted the limit on input wide chars.
	size_t dst_offset = 0;
	size_t src_offset = 0;
	while ( (!dst || dst_offset < dst_len) && src_offset < src_len )
	{
		mbstate_t ps_copy = *ps;
		wchar_t wc = src[src_offset];
		char mb[MB_CUR_MAX];
		size_t amount = wcrtomb(mb, wc, ps);

		// Stop in the event an encoding error occurred.
		if ( amount == (size_t) -1 )
		{
			if ( dst )
				*src_ptr = src + src_offset;
			return (size_t) -1;
		}

		// Stop encoding early if the character could not be completed or if
		// it does not fit in what remains of the destination buffer. The loop
		// condition guarantees dst_offset < dst_len here, so the subtraction
		// cannot wrap around.
		if ( amount == (size_t) -2 || (dst && dst_len - dst_offset < amount) )
		{
			*ps = ps_copy;
			break;
		}

		// Store the encoded multibyte character in the destination buffer.
		if ( dst )
			memcpy(dst + dst_offset, mb, amount);

		// Stop encoding after encoding a null character and hand a NULL source
		// pointer to the caller, not including the null byte in the number of
		// bytes stored in the destination buffer.
		if ( wc == L'\0' )
		{
			if ( dst )
				*src_ptr = NULL;
			return dst_offset;
		}

		dst_offset += amount;
		src_offset++;
	}

	// The source pointer is left untouched when merely counting.
	if ( dst )
		*src_ptr = src + src_offset;
	return dst_offset;
}

View File

@ -1,6 +1,6 @@
/*******************************************************************************
Copyright(C) Jonas 'Sortie' Termansen 2013.
Copyright(C) Jonas 'Sortie' Termansen 2013, 2014.
This file is part of the Sortix C Library.
@ -22,55 +22,14 @@
*******************************************************************************/
#include <assert.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <wchar.h>
// Convert a null-terminated wide-character string to a multibyte string,
// storing at most dst_len bytes.
//
// This is wcsnrtombs without a limit on the number of source wide characters.
// If ps is NULL, a conversion state private to this function is used so the
// hidden state is not shared with wcsnrtombs.
//
// Returns whatever wcsnrtombs returns for the conversion.
extern "C"
size_t wcsrtombs(char* restrict dst,
                 const wchar_t** restrict src_ptr,
                 size_t dst_len,
                 mbstate_t* ps)
{
	static mbstate_t static_ps;
	if ( !ps )
		ps = &static_ps;
	return wcsnrtombs(dst, src_ptr, SIZE_MAX, dst_len, ps);
}