diff --git a/README.md b/README.md index 09d3be7..63888c8 100644 --- a/README.md +++ b/README.md @@ -24,8 +24,8 @@ TODO * Optimization pass to turn multiply loops into commands that do `x += y * c` * Make VM use a Proxied object that gives out 0 for nonexistent elements for its memory -* Implement UTF-8 I/O * Keep a cache of compiled programs in `run()` +* Support for other types of EOF? ### gir.html * Implement a UI diff --git a/gir.js b/gir.js index 2296f16..8ae5fdc 100644 --- a/gir.js +++ b/gir.js @@ -442,7 +442,7 @@ function optimize(parsed) { // Virtual machine // ------------------------------------------------------------------ -// ([flatCommandObject]) → girVMState +// ([flatCommandObject], [int]) → girVMState function newVM(program, input) { return { // Initial state for the machine @@ -453,7 +453,7 @@ function newVM(program, input) { tapeHead: 0, input: input, - output: '' + output: [] }; } @@ -474,8 +474,9 @@ function runVM(state, maxCycles = null) { } let tapeHead = state.tapeHead; - let input = state.input; - let output = state.output; + // Create copies of input and output, since we might modify them + let input = state.input.slice(); + let output = state.output.slice(); let complete = false; let cycle = 0; @@ -526,22 +527,18 @@ function runVM(state, maxCycles = null) { case writeByte: if(!(index in memory)) memory[index] = 0; - // TODO: utf-8 - output += String.fromCodePoint(memory[index]); + output.push(memory[index]); ip++; break; case readByte: - // TODO: utf-8 // Have we reached EOF? 
// ------------------------------------------------------------------
// UTF-8
// ------------------------------------------------------------------

// string → [int]
// Encodes a JavaScript string as an array of UTF-8 byte values.
// Unpaired surrogates have no UTF-8 representation, so each one is
// encoded as U+FFFD REPLACEMENT CHARACTER instead of being emitted as an
// ill-formed three-byte sequence.
function encodeUTF8(string) {
	const encoded = [];

	// for…of iterates by code point, so a valid surrogate pair arrives
	// here as one astral character; only unpaired surrogates show up as
	// values in the 0xD800–0xDFFF range
	for(const character of string) {
		let codepoint = character.codePointAt(0);

		if(codepoint >= 0xD800 && codepoint <= 0xDFFF) {
			codepoint = 0xFFFD;
		}

		if(codepoint < 0x80) {
			// 0xxxxxxx
			encoded.push(codepoint);
		} else if(codepoint < 0x0800) {
			// 110xxxxx 10xxxxxx
			encoded.push(
				codepoint >> 6 | 0b11000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else if(codepoint < 0x10000) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 12 | 0b11100000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		} else {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			encoded.push(
				codepoint >> 18 | 0b11110000,
				codepoint >> 12 & 0b00111111 | 0b10000000,
				codepoint >> 6 & 0b00111111 | 0b10000000,
				codepoint & 0b00111111 | 0b10000000
			);
		}
	}

	return encoded;
}

// [int] → string
// Decodes an array of byte values as UTF-8. Every ill-formed sequence
// (stray continuation byte, truncated sequence, overlong form, encoded
// surrogate, value above U+10FFFF, or an element that is not an integer
// in 0–255) becomes U+FFFD. The byte that ends an ill-formed sequence is
// not consumed: it is decoded again as the start of the next sequence, so
// valid data following an error is never lost.
function decodeUTF8(encoded) {
	const REPLACEMENT = 0xFFFD;
	const codePoints = [];

	let i = 0;
	while(i < encoded.length) {
		const firstByte = encoded[i];
		i++;

		// Number of continuation bytes still expected
		let toRead = 0;
		let codePoint = 0;
		// Allowed range for the *next* continuation byte. It is narrowed
		// after certain lead bytes to rule out overlong forms, surrogates
		// and code points above U+10FFFF, then reset to 80..BF.
		let lower = 0x80;
		let upper = 0xBF;

		if(!Number.isInteger(firstByte)) {
			// Not a byte at all
			codePoints.push(REPLACEMENT);
			continue;
		} else if(firstByte >= 0x00 && firstByte <= 0x7F) {
			// 0xxxxxxx
			codePoint = firstByte;
		} else if(firstByte >= 0xC2 && firstByte <= 0xDF) {
			// 110xxxxx 10xxxxxx (C0 and C1 could only start overlong forms)
			toRead = 1;
			codePoint = firstByte & 0b00011111;
		} else if(firstByte >= 0xE0 && firstByte <= 0xEF) {
			// 1110xxxx 10xxxxxx 10xxxxxx
			toRead = 2;
			codePoint = firstByte & 0b00001111;
			// E0 80..9F would be overlong
			if(firstByte === 0xE0) lower = 0xA0;
			// ED A0..BF would be a surrogate
			if(firstByte === 0xED) upper = 0x9F;
		} else if(firstByte >= 0xF0 && firstByte <= 0xF4) {
			// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			toRead = 3;
			codePoint = firstByte & 0b00000111;
			// F0 80..8F would be overlong
			if(firstByte === 0xF0) lower = 0x90;
			// F4 90..BF would be above U+10FFFF
			if(firstByte === 0xF4) upper = 0x8F;
		} else {
			// Stray continuation byte, C0/C1, F5..FF, or out of byte range
			codePoints.push(REPLACEMENT);
			continue;
		}

		while(toRead > 0 && i < encoded.length) {
			const continuationByte = encoded[i];

			const valid = Number.isInteger(continuationByte) &&
				continuationByte >= lower && continuationByte <= upper;
			if(!valid) {
				// Leave i untouched so this byte is decoded again as the
				// first byte of the next sequence
				break;
			}

			i++;
			codePoint = codePoint << 6 | continuationByte & 0b00111111;
			lower = 0x80;
			upper = 0xBF;
			toRead--;
		}

		// toRead is nonzero here if the sequence was cut short by a bad
		// byte or by the end of the input
		codePoints.push(toRead === 0 ? codePoint : REPLACEMENT);
	}

	// Convert one code point at a time; spreading a large array into a
	// single String.fromCodePoint call could exceed the argument limit
	return codePoints.map(x => String.fromCodePoint(x)).join('');
}