From 3a9c4baa4b99d77627092c12d27a693c5cdf711a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juhani=20Krekel=C3=A4?= <juhani.haverinen@gmail.com>
Date: Thu, 30 Aug 2018 00:16:22 +0300
Subject: [PATCH] Move to a space-separated format

---
 src/hashing.py     | 42 ++++++++++++++++++++
 src/read_file.py   | 97 +++++++++++++++++++++-------------------------
 src/write_file.py  | 22 +++++++----
 sshwot-format.text | 32 +++++++--------
 4 files changed, 117 insertions(+), 76 deletions(-)

diff --git a/src/hashing.py b/src/hashing.py
index e6a33d4..0b8ff23 100644
--- a/src/hashing.py
+++ b/src/hashing.py
@@ -1,3 +1,4 @@
+import base64
 import hashlib
 import os
 
@@ -23,3 +24,44 @@ def hash_host(host):
 	salt = generate_salt()
 	hashed_host = hash_with_salt(host, salt)
 	return salt, hashed_host
+
+def base64enc(b):
+	"""base64enc(bytes) → bytes
+	Uses no padding"""
+	# Base 64 encodes 3 bytes as 4 characters
+	# /byte 1\/byte 2\/byte 3\
+	# ABCDEFGHijklmnopQRSTUVWX
+	# \64 1/\64 2/\64 3/\64 4/
+	#
+	# If you have only one or two bytes, you don't have enough bits to
+	# fill all of the characters. The rest of the bits will be taken to
+	# be zeroes.
+	# /byte 1\
+	# ABCDEFGH0000
+	# \64 1/\64 2/
+	#
+	# or, with two bytes:
+	# /byte 1\/byte 2\
+	# ABCDEFGHijklmnop00
+	# \64 1/\64 2/\64 3/
+	#
+	# This way you end up with only 2 or 3 characters containing info.
+	# This usually gets padded into a multiple of 4 with =. However,
+	# since the length of the unpadded output mod 4 is enough to
+	# generate the padding back, we can strip it out
+	return base64.b64encode(b).replace(b'=', b'')
+
+def base64dec(b64):
+	"""base64dec(bytes) → bytes
+	Can handle lack of padding."""
+	assert type(b64) == bytes
+
+	# Padded base64 is always a multiple of 4 bytes in length, because
+	# base64 decoding operates in groups of 4 base64 characters.
+	# Since we know the length of the string minus the padding, we can
+	# just pad it to the nearest multiple of 4. The outer % 4 keeps us
+	# from adding a full group of padding when none is missing
+
+	missing_padding_len = (4 - len(b64)%4) % 4
+	padding = b'=' * missing_padding_len
+	return base64.b64decode(b64 + padding, validate = True)
diff --git a/src/read_file.py b/src/read_file.py
index 619a645..8135104 100644
--- a/src/read_file.py
+++ b/src/read_file.py
@@ -1,4 +1,4 @@
-import base64
+import hashing
 
 import entry
 
@@ -30,88 +30,85 @@ def parse_header(header):
 	(if any) if it is"""
 	assert type(header) == bytes
 
-	magic = header[0:6]
-	if magic != b'SSHWOT':
+	# Check that it ends in a newline
+	if len(header) == 0 or header[-1] != 0x0a:
+		raise FileFormatError('No newline after header')
+
+	# Split it into at most three fields: magic, version, and comment.
+	# The comment may itself contain spaces, so everything after the
+	# second space has to stay in one piece. Make sure we have at least
+	# the magic and the version
+	fields = header[:-1].split(b' ', 2)
+	if len(fields) < 2:
+		raise FileFormatError('Too few fields in the header, expected at least magic and version')
+
+	# Check the magic
+	if fields[0] != b'SSHWOT':
 		raise FileFormatError('Invalid magic')
 
 	# Version 0 is the current one
-	version = header[6:7]
-	if version == b'':
-		raise FileFormatError('No newline after header')
-	if version != b'0':
-		raise VersionMismatch('Version %i not supported' % version[0])
+	if fields[1] != b'0':
+		raise VersionMismatch('Version %s not supported' % fields[1].decode('utf-8', 'replace'))
 
 	# See if we have a comment
-	if header[7:8] == b' ':
+	if len(fields) == 3:
 		# It says we have
-		if header[8:9] == b'\n':
+		if len(fields[2]) == 0:
 			# No, we don't, but we do have a space telling we
 			# have. The header is malformed
 			raise FileFormatError('Missing comment or spurious space in the header')
 		else:
-			# Yes, we do
-			# Check it ends with a newline
-			if header[-1] != 0x0a:
-				raise FileFormatError('Missing newline at the end of the header')
-
+			# Yes, we do. Extract it
 			try:
-				file_comment = header[8:-1].decode('utf-8')
+				file_comment = fields[2].decode('utf-8')
 			except UnicodeDecodeError:
 				raise FileFormatError('Comment is not valid utf-8')
 
-			return file_comment
-
-	elif header[7:8] == b'\n':
-		# No, we have newline
-		return ''
-
 	else:
-		# No, we have something else
-		raise FileFormatError("Expected a space or a newline but got '%s' instead" % header[7:].decode('utf-8'))
+		file_comment = ''
+
+	return file_comment
 
 def parse_entry(line):
 	"""parse_entry(bytes) → Entry"""
 	assert type(line) == bytes
 
-	def extract_b64_field(rest):
-		"""extract_b64_field(bytes) → (bytes: decoded_field, bytes:rest)"""
-		field_b64 = rest[0:44]
-		if len(field_b64) != 44:
-			raise FileFormatError('Unexpected end of line')
+	def decode_b64_field(b64):
 		try:
-			field = base64.b64decode(field_b64, validate = True)
-		except (ValueError, base64.binascii.Error) as err:
-			raise FileFormatError('Malformed base64 string: %s' % field_b64.decode('utf-8')) from err
+			return hashing.base64dec(b64)
+		except ValueError as err:  # binascii.Error is a ValueError subclass; base64 is not imported here
+			raise FileFormatError('Malformed base64 string: %s' % b64.decode('utf-8', 'replace')) from err
 
-		return field, rest[44:]
+	# Check that it ends in a newline
+	if len(line) == 0 or line[-1] != 0x0a:
+		raise FileFormatError('No newline after entry')
 
-	salt, rest = extract_b64_field(line)
-	hashed_host, rest = extract_b64_field(rest)
-	fingerprint, rest = extract_b64_field(rest)
+	# Split the line into at most four fields (the comment may itself
+	# contain spaces) and make sure we have at least the salt, the
+	# hashed host, and the fingerprint
+	fields = line[:-1].split(b' ', 3)
+	if len(fields) < 3:
+		raise FileFormatError('Too few fields in the entry, expected in the very least salt, hashed host, and fingerprint')
 
-	# What do we have after that?
-	if rest[0:1] == b' ':
-		# A comment?
-		if rest[1:2] == b'\n':
-			# No, but it says we have. It's malformed
+	salt = decode_b64_field(fields[0])
+	hashed_host = decode_b64_field(fields[1])
+	fingerprint = decode_b64_field(fields[2])
+
+	# See if we have a comment
+	if len(fields) == 4:
+		# It says we have
+		if len(fields[3]) == 0:
+			# No, we don't, but we do have a space telling we
+			# have. The entry is malformed
 			raise FileFormatError('Missing comment or spurious space in the entry')
 		else:
-			# Yes. Make sure it ends in a newline
-			if rest[-1] != 0x0a:
-				raise FileFormatError('No newline after entry')
-
+			# Yes, we do. Extract it
 			try:
-				comment = rest[1:-1].decode('utf-8')
+				comment = fields[3].decode('utf-8')
 			except UnicodeDecodeError:
 				raise FileFormatError('Comment is not valid utf-8')
 
-	elif rest[0:1] == b'\n':
-		# A newline
-		comment = ''
-
 	else:
-		# Something else
-		raise FileFormatError('Expected a space or a newline but got "%s" instead' % rest.decode('utf-8'))
+		comment = ''
 
 	return entry.Entry(salt, hashed_host, fingerprint, comment)
 
diff --git a/src/write_file.py b/src/write_file.py
index 60614eb..0230384 100644
--- a/src/write_file.py
+++ b/src/write_file.py
@@ -1,4 +1,4 @@
-import base64
+import hashing
 
 def write_header(f, file_comment):
 	"""write_header(file(wb), str)
@@ -6,6 +6,8 @@ def write_header(f, file_comment):
 	assert type(file_comment) == str
 	# b'SSHWOT' magic
 	f.write(b'SSHWOT')
+	# Separating space
+	f.write(b' ')
 	# Version number
 	f.write(b'0')
 	# b' ' + file_comment, if there is one
@@ -24,14 +26,20 @@ def write_entry(f, salt, hashed_host, fingerprint, comment):
 	assert type(fingerprint) == bytes and len(fingerprint) == 32
 	assert type(comment) == str
 
-	# base64 encoded (44 bytes): salt
-	f.write(base64.b64encode(salt))
+	# base64 encoded salt
+	f.write(hashing.base64enc(salt))
 
-	# base64 encoded (44 bytes): hashed_host
-	f.write(base64.b64encode(hashed_host))
+	# Separating space
+	f.write(b' ')
 
-	# base64 encoded (44 bytes): fingerprint
-	f.write(base64.b64encode(fingerprint))
+	# base64 encoded hashed_host
+	f.write(hashing.base64enc(hashed_host))
+
+	# Separating space
+	f.write(b' ')
+
+	# base64 encoded fingerprint
+	f.write(hashing.base64enc(fingerprint))
 
 	# b' ' + comment, if there is one
 	if len(comment) > 0:
diff --git a/sshwot-format.text b/sshwot-format.text
index 04e6764..5c84651 100644
--- a/sshwot-format.text
+++ b/sshwot-format.text
@@ -1,34 +1,34 @@
 Please note that all text insire quotes in the EBNF here is to be taken to
-mean bytes that would decode as that using either the ASCII or the UTF-8
-character encoding. "\n" refers specifically to the byte 0x0a, and no
-alternative newlines are acceptable.
+mean bytes that would decode as that using the ASCII character encoding.
+"\n" refers specifically to the byte 0x0a, and no alternative newlines are
+acceptable.
 
 The file has a header like:
 
 magic   = "SSHWOT" ;
 version = "0" ;
-comment = " ", ? General comment about the file. Valid utf-8, no '\n'. ? ;
-header  = magic, version, [comment], "\n" ;
+comment = ? General comment about the file. Valid utf-8, no '\n'. ? ;
+header  = magic, " ", version, [" ", comment], "\n" ;
 
-Examples of valid headers would be "SSHWOT0\n" and "SSHWOT0 Emma G. 2018\n".
+Examples of valid headers would be "SSHWOT 0\n" and "SSHWOT 0 Emma G. 2018\n".
 
-"SSHWOT0 \n" is not valid, since a space marks that there will be a comment.
+"SSHWOT 0 \n" is not valid, since a space marks that there will be a comment.
 
 After the header the entries are laid out as:
 
-salt        = ? base64 encoded salt, 44 bytes long ? ;
-hashed host = ? base64 encoded sha256(host concat salt), 44 bytes long ? ;
-fingerprint = ? base64 encoded sha256-fingerprint, 44 bytes long ? ;
-comment     = " ", ? Comment about the host/key. Valid utf-8, no '\n'. ? ;
-entry       = salt, hashed host, fingerprint, [comment], "\n" ;
+salt        = ? base64(salt) ? ;
+hashed host = ? base64(sha256(host concat salt)) ? ;
+fingerprint = ? base64(sha256-fingerprint) ? ;
+comment     = ? Comment about the host/key. Valid utf-8, no '\n'. ? ;
+entry       = salt, " ", hashed host, " ", fingerprint, [" ", comment], "\n" ;
 
-The version of base64 used uses + for 62 and / for 63, uses = for padding,
-and contains no breaks.
+The version of base64 used uses + for 62 and / for 63, doesn't use = for
+padding, and contains no breaks.
 
 Examples of valid entries are
-"Yixx+B6zrFoubPhBddgyx0nXHmbqMW1Wzneo4JqJv0U=yPUACFC/zPt/ENoIluOuWiTXor3r7oHhac63qej637E=QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVpbXF1eX2A=\n"
+"Yixx+B6zrFoubPhBddgyx0nXHmbqMW1Wzneo4JqJv0U yPUACFC/zPt/ENoIluOuWiTXor3r7oHhac63qej637E QUJDREVGR0hJSktMTU5PUFFSU1RVVldYWVpbXF1eX2A\n"
 and
-"bd/MfFs+DMVqNQQoZGGCvpTopeS0/Jt6GS5vg7J+638=cbbdTnuIh0ZwnM+/r3sAu4iHgaN3mpkcP9kJND4vBUo=YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXp7fH1+f4A= The old one\n"
+"bd/MfFs+DMVqNQQoZGGCvpTopeS0/Jt6GS5vg7J+638 cbbdTnuIh0ZwnM+/r3sAu4iHgaN3mpkcP9kJND4vBUo YWJjZGVmZ2hpamtsbW5vcHFyc3R1dnd4eXp7fH1+f4A The old one\n"
 
 Again, if there is a space following the necessary parts, there must also be
 a comment or else the entry is malformed.