Implement UTF-8

This commit is contained in:
Juhani Krekelä 2018-05-25 14:19:09 +03:00
parent 4853e42992
commit eacbc95cea
2 changed files with 122 additions and 14 deletions

View File

@ -24,8 +24,8 @@ TODO
* Optimization pass to turn multiply loops into commands that do `x += y * c` * Optimization pass to turn multiply loops into commands that do `x += y * c`
* Make VM use a Proxied object that gives out 0 for nonexistent elements for * Make VM use a Proxied object that gives out 0 for nonexistent elements for
its memory its memory
* Implement UTF-8 I/O
* Keep a cache of compiled programs in `run()` * Keep a cache of compiled programs in `run()`
* Support for other types of EOF?
### gir.html ### gir.html
* Implement a UI * Implement a UI

134
gir.js
View File

@ -442,7 +442,7 @@ function optimize(parsed) {
// Virtual machine // Virtual machine
// ------------------------------------------------------------------ // ------------------------------------------------------------------
// ([flatCommandObject]) → girVMState // ([flatCommandObject], [int]) → girVMState
function newVM(program, input) { function newVM(program, input) {
return { return {
// Initial state for the machine // Initial state for the machine
@ -453,7 +453,7 @@ function newVM(program, input) {
tapeHead: 0, tapeHead: 0,
input: input, input: input,
output: '' output: []
}; };
} }
@ -474,8 +474,9 @@ function runVM(state, maxCycles = null) {
} }
let tapeHead = state.tapeHead; let tapeHead = state.tapeHead;
let input = state.input; // Create copies of input and output, since we might modify them
let output = state.output; let input = state.input.slice();
let output = state.output.slice();
let complete = false; let complete = false;
let cycle = 0; let cycle = 0;
@ -526,22 +527,18 @@ function runVM(state, maxCycles = null) {
case writeByte: case writeByte:
if(!(index in memory)) memory[index] = 0; if(!(index in memory)) memory[index] = 0;
// TODO: utf-8 output.push(memory[index]);
output += String.fromCodePoint(memory[index]);
ip++; ip++;
break; break;
case readByte: case readByte:
// TODO: utf-8
// Have we reached EOF? // Have we reached EOF?
if(input.length == 0) { if(input.length == 0) {
// Yes, return 0 // Yes, return 0
memory[index] = 0; memory[index] = 0;
} else { } else {
// No, return character // No, return character
memory[index] = input.codePointAt(0); memory[index] = input.shift();
// FIXME: This only works for BMP
input = input.slice(1);
} }
ip++; ip++;
break; break;
@ -589,6 +586,117 @@ function runVM(state, maxCycles = null) {
return {state: newState, complete: complete, cycles: cycle}; return {state: newState, complete: complete, cycles: cycle};
} }
// ------------------------------------------------------------------
// UTF-8
// ------------------------------------------------------------------
// string → [int]
// Encode a JavaScript string as an array of UTF-8 byte values (0–255).
//
// The string is walked code point by code point, so a valid surrogate pair
// becomes a single four-byte sequence. An unpaired surrogate has no
// well-formed UTF-8 representation and is encoded as U+FFFD (the replacement
// character) instead of being emitted as an ill-formed three-byte sequence.
function encodeUTF8(string) {
	const encoded = [];

	for(const character of string) {
		let codepoint = character.codePointAt(0);

		// String iteration yields an unpaired surrogate as its own
		// "character"; substitute the replacement character for it
		if(codepoint >= 0xD800 && codepoint <= 0xDFFF) {
			codepoint = 0xFFFD;
		}

		if(codepoint < 0x80) {
			// 0xxxxxxx
			encoded.push(codepoint);
		} else if(codepoint < 0x0800) {
			// 110xxxxx 10xxxxxx
			encoded.push(
				codepoint >> 6 | 0b11000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else if(codepoint < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 12 | 0b11100000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 18 | 0b11110000,
				codepoint >> 12 & 0b00111111 | 0b10000000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		}
	}

	return encoded;
}
// [int] → string
// Decode an array of UTF-8 byte values into a JavaScript string.
//
// Every malformed sequence yields exactly one U+FFFD (replacement character)
// and decoding then resumes, so this function never throws on bad input.
// A sequence is treated as malformed when:
//  * the lead element is not an integer in 0–255, is a stray continuation
//    byte, or is 0xF8 or above
//  * it is truncated or interrupted by a non-continuation byte (that byte is
//    not consumed and gets decoded on its own afterwards)
//  * it is an overlong encoding, encodes a surrogate (U+D800–U+DFFF), or
//    encodes a value above U+10FFFF
function decodeUTF8(encoded) {
	const REPLACEMENT = 0xFFFD;
	const codePoints = [];

	for(let i = 0; i < encoded.length;) {
		const firstByte = encoded[i];
		i++;

		// Anything that isn't a byte can't start a sequence
		if(!Number.isInteger(firstByte) || firstByte < 0 || firstByte > 0xFF) {
			codePoints.push(REPLACEMENT);
			continue;
		}

		// Determine number of continuation bytes to read, decode the
		// payload bits of the first byte and note the smallest code point
		// this sequence length may legally encode (to reject overlongs)
		let toRead;
		let codePoint;
		let minimum;
		if(firstByte >> 7 === 0) {
			// 0xxxxxxx
			toRead = 0;
			codePoint = firstByte;
			minimum = 0;
		} else if(firstByte >> 5 === 0b110) {
			// 110xxxxx 10xxxxxx
			toRead = 1;
			codePoint = firstByte & 0b00011111;
			minimum = 0x80;
		} else if(firstByte >> 4 === 0b1110) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			toRead = 2;
			codePoint = firstByte & 0b00001111;
			minimum = 0x0800;
		} else if(firstByte >> 3 === 0b11110) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			toRead = 3;
			codePoint = firstByte & 0b00000111;
			minimum = 0x10000;
		} else {
			// Stray continuation byte or invalid lead byte
			codePoints.push(REPLACEMENT);
			continue;
		}

		for(; toRead > 0 && i < encoded.length; toRead--) {
			const continuationByte = encoded[i];

			// Check validity before consuming: a byte that is not a
			// continuation byte may start the next character and must
			// not be swallowed by this malformed sequence
			if(continuationByte >> 6 !== 0b10) {
				break;
			}

			i++;
			codePoint = codePoint << 6 | continuationByte & 0b00111111;
		}

		const complete = toRead === 0;
		const overlong = codePoint < minimum;
		const surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
		// String.fromCodePoint throws RangeError above U+10FFFF
		const outOfRange = codePoint > 0x10FFFF;

		if(complete && !overlong && !surrogate && !outOfRange) {
			codePoints.push(codePoint);
		} else {
			codePoints.push(REPLACEMENT);
		}
	}

	// Convert to a string
	return codePoints.map(x => String.fromCodePoint(x)).join('');
}
// ------------------------------------------------------------------ // ------------------------------------------------------------------
// User-facing functions // User-facing functions
// ------------------------------------------------------------------ // ------------------------------------------------------------------
@ -598,14 +706,14 @@ function compile(program) {
return optimize(parse(program)); return optimize(parse(program));
} }
// (string, string, bool) → string // (string, string, int) → string
function run(program, input, maxCycles = null) { function run(program, input, maxCycles = null) {
// TODO; Cache programs // TODO; Cache programs
let compiled = compile(program); let compiled = compile(program);
let vm = newVM(compiled, input); let vm = newVM(compiled, encodeUTF8(input));
let result = runVM(vm, maxCycles); let result = runVM(vm, maxCycles);
let output = result.state.output; let output = decodeUTF8(result.state.output);
// If didn't complete, mark it in the output // If didn't complete, mark it in the output
if(!result.complete) { if(!result.complete) {