Implement UTF-8

This commit is contained in:
Juhani Krekelä 2018-05-25 14:19:09 +03:00
parent 4853e42992
commit eacbc95cea
2 changed files with 122 additions and 14 deletions

View File

@ -24,8 +24,8 @@ TODO
* Optimization pass to turn multiply loops into commands that do `x += y * c`
* Make VM use a Proxied object that gives out 0 for nonexistent elements for
its memory
* Implement UTF-8 I/O
* Keep a cache of compiled programs in `run()`
* Support for other types of EOF?
### gir.html
* Implement a UI

134
gir.js
View File

@ -442,7 +442,7 @@ function optimize(parsed) {
// Virtual machine
// ------------------------------------------------------------------
// ([flatCommandObject]) → girVMState
// ([flatCommandObject], [int]) → girVMState
function newVM(program, input) {
return {
// Initial state for the machine
@ -453,7 +453,7 @@ function newVM(program, input) {
tapeHead: 0,
input: input,
output: ''
output: []
};
}
@ -474,8 +474,9 @@ function runVM(state, maxCycles = null) {
}
let tapeHead = state.tapeHead;
let input = state.input;
let output = state.output;
// Create copies of input and output, since we might modify them
let input = state.input.slice();
let output = state.output.slice();
let complete = false;
let cycle = 0;
@ -526,22 +527,18 @@ function runVM(state, maxCycles = null) {
case writeByte:
if(!(index in memory)) memory[index] = 0;
// TODO: utf-8
output += String.fromCodePoint(memory[index]);
output.push(memory[index]);
ip++;
break;
case readByte:
// TODO: utf-8
// Have we reached EOF?
if(input.length == 0) {
// Yes, return 0
memory[index] = 0;
} else {
// No, return character
memory[index] = input.codePointAt(0);
// FIXME: This only works for BMP
input = input.slice(1);
memory[index] = input.shift();
}
ip++;
break;
@ -589,6 +586,117 @@ function runVM(state, maxCycles = null) {
return {state: newState, complete: complete, cycles: cycle};
}
// ------------------------------------------------------------------
// UTF-8
// ------------------------------------------------------------------
// string → [int]
/**
 * Encode a string as an array of UTF-8 byte values (integers 0–255).
 *
 * The string is walked one code point at a time, so a valid surrogate
 * pair produces a single 4-byte sequence. An unpaired surrogate has no
 * UTF-8 representation and is encoded as U+FFFD (EF BF BD) instead.
 *
 * @param {string} string - Text to encode.
 * @returns {number[]} The UTF-8 bytes, in order. Empty for an empty string.
 */
function encodeUTF8(string) {
	const encoded = [];

	// for...of iterates by code point, not by UTF-16 code unit
	for(const character of string) {
		let codepoint = character.codePointAt(0);

		// The iterator hands out unpaired surrogates as-is. Encoding
		// them would produce bytes no conforming decoder accepts, so
		// substitute the replacement character.
		if(codepoint >= 0xD800 && codepoint <= 0xDFFF) {
			codepoint = 0xFFFD;
		}

		if(codepoint < 0x80) {
			// 0xxxxxxx
			encoded.push(codepoint);
		} else if(codepoint < 0x0800) {
			// 110xxxxx 10xxxxxx
			encoded.push(
				codepoint >> 6 | 0b11000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else if(codepoint < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 12 | 0b11100000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 18 | 0b11110000,
				codepoint >> 12 & 0b00111111 | 0b10000000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		}
	}

	return encoded;
}
// [int] → string
/**
 * Decode an array of UTF-8 byte values into a string.
 *
 * Malformed input never throws. Each rejected sequence contributes one
 * U+FFFD replacement character to the result. A sequence is rejected if
 *  - its first byte is not a valid lead byte (including a stray
 *    continuation byte),
 *  - it is missing continuation bytes (cut short by the end of input or
 *    by a byte that is not a continuation byte),
 *  - it is an overlong encoding,
 *  - it decodes to a surrogate (U+D800–U+DFFF), or
 *  - it decodes to a value above U+10FFFF.
 * A byte that interrupts a sequence is not consumed; decoding resumes
 * from it, so valid text following a broken sequence is preserved.
 *
 * @param {number[]} encoded - Byte values to decode.
 * @returns {string} The decoded text. Empty for an empty array.
 */
function decodeUTF8(encoded) {
	const REPLACEMENT = 0xFFFD;
	// Smallest code point that needs N continuation bytes, indexed by N.
	// Anything below it in an N+1 byte sequence is an overlong encoding.
	const smallestFor = [0x00, 0x80, 0x0800, 0x10000];

	const codePoints = [];

	let i = 0;
	while(i < encoded.length) {
		const firstByte = encoded[i];
		i++;

		// Determine number of continuation bytes to read and
		// decode the payload bits of the first byte
		let expected;
		let codePoint;
		if(firstByte >> 7 === 0) {
			// 0xxxxxxx
			expected = 0;
			codePoint = firstByte;
		} else if(firstByte >> 5 === 0b110) {
			// 110xxxxx 10xxxxxx
			expected = 1;
			codePoint = firstByte & 0b00011111;
		} else if(firstByte >> 4 === 0b1110) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			expected = 2;
			codePoint = firstByte & 0b00001111;
		} else if(firstByte >> 3 === 0b11110) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			expected = 3;
			codePoint = firstByte & 0b00000111;
		} else {
			// Not a lead byte, push replacement char
			codePoints.push(REPLACEMENT);
			continue;
		}

		// Only consume bytes that really are continuation bytes. The
		// byte is checked before i is advanced so that a byte which
		// breaks the sequence is decoded on the next iteration.
		let remaining = expected;
		while(remaining > 0 && i < encoded.length &&
		      (encoded[i] >> 6) === 0b10) {
			codePoint = codePoint << 6 | encoded[i] & 0b00111111;
			i++;
			remaining--;
		}

		const complete = remaining === 0;
		const overlong = codePoint < smallestFor[expected];
		const surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
		// String.fromCodePoint throws RangeError above U+10FFFF
		const outOfRange = codePoint > 0x10FFFF;

		if(complete && !overlong && !surrogate && !outOfRange) {
			codePoints.push(codePoint);
		} else {
			codePoints.push(REPLACEMENT);
		}
	}

	// Convert to a string
	return codePoints.map(x => String.fromCodePoint(x)).join('');
}
// ------------------------------------------------------------------
// User-facing functions
// ------------------------------------------------------------------
@ -598,14 +706,14 @@ function compile(program) {
return optimize(parse(program));
}
// (string, string, bool) → string
// (string, string, int) → string
function run(program, input, maxCycles = null) {
// TODO: Cache programs
let compiled = compile(program);
let vm = newVM(compiled, input);
let vm = newVM(compiled, encodeUTF8(input));
let result = runVM(vm, maxCycles);
let output = result.state.output;
let output = decodeUTF8(result.state.output);
// If didn't complete, mark it in the output
if(!result.complete) {