casio_doc/fontcharacter/tools/dumpbin.py

#!/usr/bin/env python3
""" Utility to dump data from a set binary file.
	Mainly here to check that the format is correct for now.
"""

import os, unicodedata
from functools import cmp_to_key
from argparse import ArgumentParser

# ---
# Decoding function.
# ---

def frombytes(bnum):
	""" Decode an unsigned big-endian integer.

		bnum is either a bytes-like object, decoded as an unsigned
		big-endian number, or an integer (which is what indexing a bytes
		object yields), returned unchanged.
	"""
	# isinstance() rather than an exact type comparison, so that int
	# subclasses are passed through as well.
	if isinstance(bnum, int):
		return bnum
	return int.from_bytes(bnum, byteorder='big', signed=False)

# Leaders found by the last decode_set1() call, indexed by leader byte.
leaders = {}
def decode_set1(braw, only_check=True):
	""" Check, and optionally dump, the content of a version 1 set file.

		braw is the whole file content as bytes, magic and version
		included. When only_check is true, the file is only validated;
		otherwise a description of its content is also printed.

		The module-level `leaders` dictionary is emptied, then filled with
		one entry per leader byte: 'start' (index of its first character),
		'count' (number of characters), 'pos' (index in the leaders table)
		and 'chars' (the decoded characters, indexed by code).

		Returns 0 if the file is valid, or 1 after printing an
		"ERROR: ..." message describing the first problem found.
	"""
	# Forget what a previous call may have left in the shared table.
	leaders.clear()

	# The fixed-size header is 32 bytes long; refuse anything shorter
	# instead of failing on an out-of-range index below.
	if len(braw) < 32:
		print("ERROR: Invalid filesize!",
			"The header takes 32B, the file has %dB"%len(braw))
		return 1

	# Get the rest of the header. Bytes 13 and 14 to 15 hold two more
	# fields (named pic_h and pic_fmt by the original code) that are not
	# checked here.
	cmajors  = frombytes(braw[9])
	cchars   = frombytes(braw[10:12])
	flags    = frombytes(braw[12])
	checksum = frombytes(braw[20:24])
	filesize = frombytes(braw[24:28])
	datasize = frombytes(braw[28:32])

	# Get the flags.
	with_unicode = bool(flags & 0x01)
	with_cat     = bool(flags & 0x02)
	with_newcat  = bool(flags & 0x04)
	with_ctf     = bool(flags & 0x08)
	with_casemul = bool(flags & 0x10)

	# Make the entry size: 12 fixed bytes, plus one 4-byte offset per
	# optional token kind.
	centsize = 8 + 4 + 4 * \
		(with_unicode + with_cat + with_newcat + with_ctf + with_casemul)

	# Check the sizes and the checksum.
	if filesize != len(braw):
		print("ERROR: Invalid filesize!",
			"Declared %dB, calculated %dB"%(filesize, len(braw)))
		return 1
	calc_datasize = filesize - 32 - 4 * cmajors - centsize * cchars
	if datasize != calc_datasize:
		print("ERROR: Invalid datasize!",
			"Declared %dB, calculated %dB"%(datasize, calc_datasize))
		return 1
	# The checksum is the sum of every byte following the header.
	calc_checksum = sum(braw[32:])
	if checksum != calc_checksum:
		print("ERROR: Invalid checksum!",
			"Declared 0x%08X, calculated 0x%08X"%(checksum, calc_checksum))
		return 1

	# Get the binary data, which occupies the end of the file.
	data_off = filesize - datasize
	bdata = braw[data_off:]

	# Get leaders: 4 bytes each, the code in the first byte and the index
	# of the first character in the last two.
	btables = braw[32:data_off]
	leads = []
	for id_major in range(cmajors):
		bmajor = btables[id_major * 4:id_major * 4 + 4]
		code = frombytes(bmajor[0])
		if code in leaders:
			print("ERROR: Duplicate major 0x%02X."%code)
			return 1

		start = frombytes(bmajor[2:])
		if start > cchars:
			# Its characters would be read past the end of the table.
			print("ERROR: major 0x%02X starts at character %d"%(code, start),
				"but there are only %d characters!"%cchars)
			return 1

		leads.append(code)
		leaders[code] = {
			'start': start,
			'count': 0,
			'chars': {},
			'pos': id_major
		}

	# Sort leaders by start, then by position in the table for leaders
	# sharing the same start.
	leads.sort(key=lambda lead: (leaders[lead]['start'], leaders[lead]['pos']))

	# Get counts: a leader owns the characters up to the next leader's
	# start, and the last one owns the characters up to the end.
	for cur, nxt in zip(leads, leads[1:]):
		leaders[cur]['count'] = leaders[nxt]['start'] - leaders[cur]['start']
	if leads:
		last = leaders[leads[-1]]
		last['count'] = cchars - last['start']

	# Get characters according to their leader.
	bchars = btables[cmajors * 4:]
	for lead, info in leaders.items():
		chars = info['chars']
		for id_char in range(info['start'], info['start'] + info['count']):
			bchar = bchars[id_char * centsize:(id_char + 1) * centsize]
			code = frombytes(bchar[:2])
			if code >> 8 != lead:
				print("ERROR: character 0x%04X at position %d"%(code, id_char),
					"should have leader 0x%02X but"%lead,
					"has leader 0x%02X!"%(code >> 8))
				return 1
			if code in chars:
				print("ERROR: duplicate character 0x%04X"%code,
					"at position %d"%id_char,
					"(prev. %d)"%chars[code]['pos'])
				return 1

			# Get the FONTCHARACTER sequence: a byte that is a known leader
			# is combined with the following byte, any other byte stands
			# for itself.
			mul_off = frombytes(bchar[8:12])
			mul_sz  = frombytes(bchar[2])
			mul = None
			if mul_sz:
				rmul = bdata[mul_off:mul_off + mul_sz]
				mul = []
				idx = 0
				while idx < len(rmul):
					if rmul[idx] not in leaders:
						mul.append(rmul[idx])
						idx += 1
						continue
					if idx + 1 >= len(rmul):
						print("ERROR: character 0x%04X"%code,
							"at position %d"%id_char,
							"has a sequence ending with leader",
							"0x%02X!"%rmul[idx])
						return 1
					mul.append((rmul[idx] << 8) | rmul[idx + 1])
					idx += 2

			# Get the Unicode string. `off` tracks the position of the next
			# optional offset within the entry.
			uni = None
			off = 12
			if with_unicode:
				uni_sz = frombytes(bchar[3])
				uni_off = frombytes(bchar[off:off + 4])
				off += 4
				if uni_sz:
					runi = bdata[uni_off:uni_off + uni_sz]
					try:
						uni = runi.decode('utf-8')
					except UnicodeDecodeError:
						print("ERROR: character 0x%04X"%code,
							"at position %d"%id_char,
							"has an invalid UTF-8 string!")
						return 1

			# TODO: get the rest
			chars[code] = {
				'uni': uni,
				'mul': mul,
				'pos': id_char
			}

	if only_check:
		return 0
	print("OVERALL HEADER")
	print("%d bytes (data zone is %dB)"%(filesize, datasize))
	print("%d leader characters, %d characters"%(cmajors, cchars))
	print("")
	print("Tokens and sequences in this file:")
	print("- FONTCHARACTER sequences")
	if with_unicode: print("- Unicode equivalents")
	if with_cat:     print("- CAT tokens")
	if with_newcat:  print("- Newcat tokens")
	if with_ctf:     print("- CTF tokens")
	if with_casemul: print("- Casemul tokens")
	print("")

	for lead, data in leaders.items():
		print("0x%02X LEADER"%lead)
		print("Starts at character 0x%04X, stops at 0x%04X (count: %d)"
			%(data['start'], data['start'] + data['count'] - 1,
				data['count']))
		if data['chars']:
			print("")
		for code, char in data['chars'].items():
			print("- 0x%0*X"%(4 if code > 0xFF else 2, code), end='')
			mul = char['mul']
			if mul:
				m = ', '.join("0x%0*X"%(4 if x > 0xFF else 2, x) for x in mul)
				print(" - seq: %s"%m, end='')

			uni = char['uni']
			if uni:
				# Strings holding characters from a Unicode "C" category
				# (control, format, ...) are printed fully escaped.
				if any(unicodedata.category(c).startswith('C') for c in uni):
					uni = ''.join('\\x%02X'%ord(c) for c in uni)
				print(" - unicode: \"%s\""%uni, end='')
			print("")
		print("")
	return 0

def decode_set(braw, only_check=True):
	""" Check the magic and the version of a set file, then decode it.

		braw is the whole file content as bytes; only_check is handed
		over to the version-specific decoder.

		Returns what the version-specific decoder returns, or 1 after
		printing an "ERROR: ..." message if the magic string is wrong, or
		the version byte is missing or unmanaged.
	"""
	# Check the magic: 8 bytes, followed by one version byte.
	if braw[:8] != b"CASIOFC\x7f":
		print("ERROR: Invalid magic string!")
		return 1
	if len(braw) < 9:
		# The file stops right after the magic string.
		print("ERROR: Missing version byte!")
		return 1

	version = braw[8]
	if version == 0x01:
		return decode_set1(braw, only_check)
	print("ERROR: Unmanaged version 0x%02X!"%version)
	return 1

# ---
# Main function.
# ---

if __name__ == '__main__':
	# Parse the arguments.
	ap = ArgumentParser(description='FONTCHARACTER binary file dumper')

	ap.add_argument('--only-check', help='Should only check if the file is valid',
		action="store_true")
	ap.add_argument('input', help='The file which to dump the content.')
	args = ap.parse_args()

	# Obtain the file; the context manager closes it once it has been read.
	with open(args.input, "rb") as fp:
		braw = fp.read()

	# Decode it, and use the result (0 or 1) as the exit status.
	# SystemExit is raised directly as the exit() helper is only provided
	# by the site module.
	raise SystemExit(decode_set(braw, args.only_check))

# End of file.