#!/usr/bin/env python
"""
arch/core.py
============
The architecture's core module implements essential classes
for the definition of new cpu architectures:
- the :class:`instruction` class models cpu instructions decoded by the disassembler.
- the :class:`disassembler` class implements the instruction decoding logic based \
on provided specifications.
- the :class:`ispec` class is a function decorator used to define the \
specification of an instruction.
- the :class:`Formatter` class is used for instruction pretty printing.
"""
# This code is part of Amoco
# Copyright (C) 2006-2014 Axel Tillequin (bdcht3@gmail.com)
# published under GPLv2 license
import inspect
import importlib
import codecs
from types import FunctionType
from collections import defaultdict
from functools import reduce
import pyparsing as pp
from crysp.bits import Bits, pack, unpack
from amoco.logger import Log
logger = Log(__name__)
logger.debug("loading module")
from amoco.ui.render import Token, highlight
# Instruction categories. The two non-positive values flag instructions that
# cannot be given a regular category.
type_unpredictable = -1
type_undefined = 0
(
    type_data_processing,
    type_control_flow,
    type_cpu_state,
    type_system,
    type_other,
) = range(1, 6)

# Printable name of every instruction category.
INSTRUCTION_TYPES = dict(
    [
        (type_unpredictable, "unpredictable"),
        (type_undefined, "undefined"),
        (type_data_processing, "data_processing"),
        (type_control_flow, "control_flow"),
        (type_cpu_state, "cpu_state"),
        (type_system, "system"),
        (type_other, "other"),
    ]
)
class icore(object):
    """Core of the generic :class:`instruction` class.

    It defines the mandatory API shared by all instructions.

    Attributes:
        bytes (bytes): the instruction's bytes.
        type (int): one of (type_data_processing, type_control_flow,
            type_cpu_state, type_system, type_other), or
            type_undefined (default), or type_unpredictable.
        spec (ispec): the specification that was decoded by the disassembler
            to instantiate this instruction.
        mnemonic (str): the mnemonic string as defined by the specification.
        operands (list): the list of operands' expressions.
        misc (dict): a defaultdict for passing various arch-dependent infos
            (it returns None for undefined keys.)
    """

    def __init__(self, istr=b""):
        self.bytes = bytes(istr)
        self.type = type_undefined
        self.spec = None
        self.mnemonic = None
        self.operands = []
        # catch-all container for arch-dependent data
        # (see x86 specs for example of misc usage.)
        self.misc = defaultdict(_core_misc_default)

    @classmethod
    def set_uarch(cls, uarch):
        "class method that attaches the instructions' semantics uarch dict"
        cls._uarch = uarch

    def typename(self):
        "name of the instruction's type, as found in INSTRUCTION_TYPES"
        return INSTRUCTION_TYPES[self.type]

    def __call__(self, fmap):
        """Looks up the uarch["i_<mnemonic>"] semantics function and calls it
        with (self, fmap). A warning is logged, and nothing is called, when no
        uarch is attached or when the mnemonic has no entry in it.
        """
        if self.type in (type_undefined, type_unpredictable):
            # reported, but the semantics lookup below is still attempted
            logger.error("%s instruction" % self.typename())
        try:
            semantics = self._uarch["i_%s" % self.mnemonic]
        except AttributeError:
            logger.warning("no uarch defined (%s)" % self.mnemonic)
            return
        except KeyError:
            logger.warning("instruction %s not implemented" % self.mnemonic)
            return
        semantics(self, fmap)

    @property
    def length(self):
        "number of bytes of the instruction"
        return len(self.bytes)
# instruction class
# -----------------
class instruction(icore):
    """Generic instruction class, usable for any cpu instruction set.

    It provides a common API for all arch-independent methods by extending
    :class:`icore` with an :attr:`address` attribute and formatter methods.

    Attributes:
        address (cst): the memory address where this instruction has been
            disassembled.
    """

    def __init__(self, istr):
        icore.__init__(self, istr)
        self.address = None

    def __repr__(self):
        # members that are either shown explicitly or deliberately left out:
        hidden = (
            "address",
            "mnemonic",
            "bytes",
            "spec",
            "operands",
            "misc",
            "formatter",
        )
        # the text starts with the name of the module that defines the spec hook
        parts = [inspect.getmodule(self.spec.hook).__name__ if self.spec else ""]
        if self.address is not None:
            parts.append(" [%s] " % self.address)
        parts.append(" %s ( " % self.mnemonic)
        for name, value in inspect.getmembers(self):
            if name in hidden or name.startswith("_") or inspect.ismethod(value):
                continue
            parts.append("%s=%s " % (name, value))
        return "<%s)>" % "".join(parts)

    def __str__(self):
        # self.formatter is not defined in this class
        return self.formatter(i=self)

    def toks(self):
        "returns the (unjoined) list of formatted tokens."
        return self.formatter(i=self, toks=True)
class InstructionError(Exception):
    """Exception carrying the instruction that could not be decoded.

    Attributes:
        ins: the offending instruction object, also available as ``args[0]``.
    """

    def __init__(self, i):
        # Forward the instruction to Exception so that ``args`` is populated:
        # rebuilding the exception from ``__reduce__`` (copy/pickle) calls
        # ``InstructionError(*args)`` and would fail with empty args.
        Exception.__init__(self, i)
        self.ins = i

    def __str__(self):
        return repr(self.ins)
class DecodeError(Exception):
    """Raised by :meth:`ispec.decode` when the input bytes are too short for
    the spec or do not match its fixed bits."""
def _core_misc_default():
    """Default factory of the :attr:`icore.misc` defaultdict: every undefined
    key maps to None."""
    return None
# disassembler core class
# ------------------------
class disassembler(object):
    """Generic disassembler that decodes a byte string from sets of
    instruction specifications, together with parameters such as endianness
    and the way to select the appropriate instruction set.

    Arguments:
        specmodules: list of python modules containing ispec decorated funcs
        iclass: the specific instruction class based on :class:`instruction`
        iset: lambda used to select module (ispec list)
        endian: instruction fetch endianness (1: little, -1: big)

    Attributes:
        maxlen: the length of the longest instruction found in provided specmodules.
        iset: the lambda used to select the right specifications for decoding
        endian: the lambda used to define endianness.
        specs: the *tree* of :class:`ispec` objects that defines the cpu architecture.
    """

    def __init__(
        self,
        specmodules,
        iclass=instruction,
        iset=(lambda *args, **kargs: 0),
        endian=(lambda *args, **kargs: 1),
    ):
        self.iclass = iclass
        # byte length of the widest spec mask over all modules:
        self.maxlen = max(s.mask.size // 8 for m in specmodules for s in m.ISPECS)
        self.iset = iset
        self.endian = endian
        # one ispecs tree is built for each module:
        modnames = [m.__name__ for m in specmodules]
        logger.debug("building specs tree for modules %s", modnames)
        self.specs = [self.setup(m.ISPECS) for m in specmodules]
        # some arch like x86 require a stateful decoding due to optional prefixes,
        # so we keep an __i instruction for decoding until a non prefix ispec is used.
        self.__i = None

    def setup(self, ispecs):
        """Recursively organizes the ispecs list into a tree so that __call__
        does not have to try every spec until one matches.

        The result is a pair (f, l). When f is 0, l is a plain list of specs
        that must be tried one after the other. Otherwise f is the submask to
        check at this level and l is a defaultdict such that l[x] is the
        subtree of the specs whose fixed bits under f are equal to x.
        Note that the ispecs list is sorted in place.
        """
        # most constrained specs (largest mask.hw()) come first:
        ispecs.sort(key=(lambda spec: spec.mask.hw()), reverse=True)
        if len(ispecs) < 5:
            # too small to be worth dividing
            return (0, ispecs)
        if self.endian() == -1:
            # in bigendian cases (like ARM), bytes are supposed to be MSB-justified:
            # a spec that is 1 byte long needs to match the MSB of the
            # encoded instruction.
            width = self.maxlen * 8

            def adjust(bits):
                return bits.ival << (width - bits.size)

        else:

            def adjust(bits):
                return bits.ival

        # bits that are fixed in every spec of this subset:
        common = reduce(lambda acc, m: acc & m, [adjust(s.mask) for s in ispecs])
        if common == 0:
            # no separating mask
            return (0, ispecs)
        # group the specs by the value of their fixed bits under the common mask:
        groups = defaultdict(list)
        for spec in ispecs:
            groups[adjust(spec.fix) & common].append(spec)
        if len(groups) == 1:
            # a single branch does not separate anything: stop dividing here
            return (0, list(groups.values())[0])
        for value in list(groups):
            groups[value] = self.setup(groups[value])
        return (common, groups)

    def __call__(self, bytestring, **kargs):
        """Decodes the instruction found at the start of bytestring.

        The keyword arguments are forwarded to the endian and iset lambdas.
        If an ``address`` keyword is present, its value is set as the address
        of the decoded instruction.
        Returns the instruction, or None if no spec matches.
        """
        order = self.endian(**kargs)
        raw = Bits(bytestring[::order], bitorder=1)
        word = raw.ival
        if order == -1:
            # same MSB justification as the one applied to specs in setup
            word <<= self.maxlen * 8 - raw.size
        # walk down the tree of specs selected by iset:
        node = self.specs[self.iset(**kargs)]
        while node is not None:
            submask, branches = node
            if submask != 0:
                # go deeper in the tree according to the submask value of word
                node = branches.get(word & submask, None)
                continue
            # we are on a leaf: search linearly over this branch
            for spec in branches:
                try:
                    found = spec.decode(
                        bytestring, order, i=self.__i, iclass=self.iclass
                    )
                except (DecodeError, InstructionError):
                    continue
                # we found the instruction (or prefix)
                pfx = found.spec.pfx
                if pfx is True:
                    # keep the prefix and decode what follows it
                    if self.__i is None:
                        self.__i = found
                    return self(bytestring[spec.mask.size // 8 :], **kargs)
                if pfx > 0:
                    found.misc["xsz"] = pfx
                self.__i = None
                if "address" in kargs:
                    found.address = kargs["address"]
                return found
            logger.debug(
                "no instruction spec matching %s" % (codecs.encode(bytestring, "hex"))
            )
            break
        # nothing decoded: drop any pending prefix state
        self.__i = None
        return None
# -----------------------------------------
class ispec(object):
    """ispec (customizable) decorator

    @ispec allows to easily define instruction decoders based on architecture
    specifications.

    Arguments:
        spec (str):
            a human-friendly *format* string that describes how the ispec object will
            (on request) decode a given bytestring and how it will expose various
            decoded entities to the decorated function in order to define an instruction.
        **kargs:
            additional arguments to ispec decorator **must** be provided with ``name=value``
            form and are declared as attributes/values within the instruction instance *before*
            calling the decorated function. See below for conventions about names.

    Attributes:
        format (str): the spec format passed as argument (see Note below).
        hook (callable): the decorated python function to be called during decoding. The hook
            function name is relevant only for instructions' formatter.
            See :class:`arch.core.Formatter`.
        iattr (dict): the dictionary of instruction attributes to add before decoding.
            Attributes and their values are passed from the spec's kargs when the
            name does not start with an underscore.
        fargs (dict): the dictionary of keywords arguments to pass to the hook.
            These keywords are decoded from the format or given by the spec's kargs
            when name starts with an underscore.
        precond (func): an optional function that takes the instruction object as argument
            and returns a boolean to indicate whether the hook can be called or not.
            (This allows to avoid decoding when a prefix is missing for example.)
        size (int): the bit length of the format (``LEN`` value)
        fix (Bits): the values of fixed bits within the format
        mask (Bits): the mask of fixed bits within the format

    Examples:
        This statement creates an ispec object with hook ``f``, and registers this object
        automatically in the ISPECS list object of the module where the statement is found::

            @ispec("32[ .cond(4) 101 1 imm24(24) ]", mnemonic="BL", _flag=True)
            def f(obj,imm24,_flag):
                [...]

        When provided with a bytestring, the :meth:`decode` method of this ispec object will:

        - proceed with decoding ONLY if bits 27,26,25,24 are 1,0,1,1 or raise an exception
        - instantiate an instruction object (obj)
        - decode 4 bits at position [28,29,30,31] and provide this value as an integer
          in 'obj.cond' instruction instance attribute.
        - decode 24 bits at positions 23..0 and provide this value as an integer as
          argument 'imm24' of the decorated function f.
        - set obj.mnemonic to 'BL' and pass argument _flag=True to f.
        - call f(obj,...)
        - return obj

    Note:
        The ``spec`` string format is ``LEN ('<' or '>') '[' FORMAT ']' ('+' or '&' NUMBER)``

        - ``LEN`` is either an integer that represents the bit length of the instruction or '*'.
          Length must be a multiple of 8, '*' is used for a variable length
          instruction.
        - ``FORMAT`` is a series of *directives* (see below.)
          Each directive represents a sequence of bits ordered according to the spec
          direction : '<' (default) means that directives are ordered from MSB (bit index LEN-1)
          to LSB (bit index 0) whereas '>' means LSB to MSB.

        The spec string is optionally terminated with '+' to indicate that it
        represents an instruction *prefix*, or by '&' NUMBER to indicate that the instruction
        has a *suffix* of NUMBER more bytes to decode some of its operands.
        In the *prefix* case, the bytestring matching the ispec format is stacked temporarily
        until the rest of the bytestring matches a non prefix ispec.
        In the *suffix* case, only the spec bytestring is used to define the instruction
        but the :meth:`read_instruction` fetcher will provide NUMBER more bytes to the
        :meth:`xdata` method of the instruction.

        The directives defining the ``FORMAT`` string are used to associate symbols to bits
        located at dedicated offsets within the bitstring to be decoded. A directive has the
        following syntax:

        * ``-`` (indicates that current bit position is not decoded)
        * ``0`` (indicates that current bit position must be 0)
        * ``1`` (indicates that current bit position must be 1)

        or

        * ``type SYMBOL location`` where:

          * ``type`` is an *optional* modifier char with possible values:

            * ``.`` indicates that the ``SYMBOL`` will be an *attribute* of the :class:`instruction`.
            * ``~`` indicates that the decoded value will be returned as a Bits instance.
            * ``#`` indicates that the decoded value will be returned as a string of [01] chars.
            * ``=`` indicates that decoding should *end* at current position (overlapping)

            if not present, the ``SYMBOL`` will be passed as a keyword argument to the function with
            value decoded as an integer.

          * ``SYMBOL``: is a mandatory string matching regex ``[A-Za-z_][0-9A-Za-z_]*``
          * ``location``: is an optional string matching the following expressions:

            * ``( len )`` : indicates that the value is decoded from the next len bits starting
              from the current position of the directive within the ``FORMAT`` string.
            * ``(*)`` : indicates a *variable length directive* for which the value is decoded
              from the current position with all remaining bits in the ``FORMAT``.
              If the ``LEN`` is also variable then all remaining bits from the instruction
              buffer input string are used.

            default location value is ``(1)``.

        The special directive ``{byte}`` is a shortcut for 8 fixed bits. For example
        ``8>[{2f}]`` is equivalent to ``8>[ 1111 0100 ]``, or ``8<[ 0010 1111 ]``.
    """

    __slots__ = [
        "format",
        "iattr",
        "fargs",
        "precond",
        "ast",
        "fix",
        "mask",
        "pfx",
        "size",
        "hook",
    ]

    def __init__(self, format, **kargs):
        self.format = format
        self.setup(kargs)
        # when ispec is used as a function decorator, hook holds the decorated function
        self.hook = None

    def __getstate__(self):
        """Returns a state made of the format string and of the name of the
        module that defines the hook only."""
        D = {}
        D["format"] = self.format
        D["module"] = self.hook.__module__
        return D

    def __setstate__(self, state):
        """Restores the format, then takes the hook of the first spec with the
        same format found in the ISPECS list of the module named in the state
        (which is imported for that purpose)."""
        # NOTE(review): only format and hook are restored here; the other slots
        # (iattr, fargs, fix, mask, ...) are left unset -- confirm that restored
        # objects are never used for decoding.
        self.format = state["format"]
        modname = state["module"]
        m = importlib.import_module(modname)
        self.hook = None
        for h in m.ISPECS:
            if h.format == self.format:
                self.hook = h.hook
                break

    def setup(self, kargs):
        """Dispatches the decorator's keyword arguments, then parses the format.

        A name starting with an underscore goes to :attr:`fargs`, except
        ``__obj`` which defines :attr:`precond`. Any other name goes to
        :attr:`iattr`.
        """
        self.iattr = {}
        self.fargs = {}
        self.precond = None
        for k, v in kargs.items():
            if k.startswith("_"):
                if k == "__obj":
                    self.precond = v
                else:
                    self.fargs[k] = v
            else:
                self.iattr[k] = v
        self.ast = self.buildspec()

    def fixed(self):
        """Returns the string of the fixed bits values where every non-fixed
        position is replaced by '-'. The string is reversed when the spec
        direction is '<'."""
        s = list(str(self.fix))
        for i, x in enumerate(self.mask):
            if x == 0:
                s[i] = "-"
        if self.ast[0][1] == "<":
            s.reverse()
        return "".join(s)

    def buildspec(self):
        """Parses the format string and defines size, pfx, fix and mask.

        Every directive of the format adds an extractor function to
        :attr:`fargs` (or to :attr:`iattr` for '.' directives); the extractor
        takes the Bits to decode and returns the value of the symbol.

        Returns:
            the parsed format, as returned by the specdecode parser.

        Raises:
            ValueError: if a symbol is defined more than once.
        """
        ast = specdecode.parseString(self.format, True)
        size, direction = ast[0]
        self.size = size
        fmt = ast[1]
        self.pfx = ast[2]
        xsz = ast[3]
        # NOTE(review): the '&' NUMBER value is kept only if '+' is present as
        # well, whereas the class documentation presents them as alternatives
        # -- confirm which one is intended.
        if self.pfx and xsz:
            self.pfx = xsz
        go = +1
        chklen = True
        if direction == "<":  # format goes from high bits to low bits
            fmt = list(reversed(fmt))
            go = -1
        if size == "*":
            # variable length: the size is the number of bits found before
            # the first variable length directive (if any).
            self.size = 0
            chklen = False
            size = 0
            for d in fmt:
                if d in ("-", "0", "1"):
                    size += 1
                elif isinstance(d, Bits):
                    size += d.size
                else:
                    loc = d[2]
                    if loc == "*":
                        break
                    if d[0] != "=":
                        size += loc
        if size % 8 != 0:
            logger.error("ispec length %d not a multiple of 8 %s" % (size, self.format))
        self.fix = Bits(0, size)  # values of fixed bits
        self.mask = Bits(0, size)  # location of fixed bits
        i = 0  # current bit position
        count = 0  # number of bits covered so far
        for d in fmt:
            if chklen and not i < size:
                logger.error("ispec format too wide %s" % self.format)
            # unknown bit (skipped)
            if d == "-":
                i += 1
                count += 1
                continue
            # fixed bit:
            if d in ("0", "1"):
                self.fix[i] = int(d)
                self.mask[i] = 1
                i += 1
                count += 1
                continue
            # fixed byte:
            if isinstance(d, Bits):
                self.fix[i : i + d.size] = d
                self.mask[i : i + d.size] = d.mask
                i += d.size
                count += d.size
                continue
            # directive:
            opt, symbol, loc = d
            if loc != "*":
                # an '=' directive overlaps bits that are already counted, so
                # the position is moved back before or after the slice
                # depending on the direction.
                if opt == "=" and go > 0:
                    i = i - loc
                sta = i
                sto = i + loc
                if sta < 0 or sto > size:
                    logger.error("ispec directive out of bound in %s" % self.format)
                if opt != "=":
                    count += loc
                i = sto
                if opt == "=" and go < 0:
                    i = i - loc
            else:
                # variable length directive: takes all remaining bits
                if opt == "=":
                    logger.error("ispec directive invalid length in %s" % self.format)
                sta = i
                sto = None
                i = size
                if count < size:
                    count = size
                chklen = True
            # now set D (fargs or iattr) to corresponding extractor lambdas which
            # will be called when decode is called by the disassembler:
            D = self.fargs
            if "." in opt:
                D = self.iattr
            if symbol in D:
                # raising the value returned by logger.error() is not valid
                # (it is not expected to be an exception), so the error is
                # logged first and a real exception is raised.
                msg = "ispec symbol %s redefined" % symbol
                logger.error(msg)
                raise ValueError(msg)
            # the slice bounds are bound as default values since sta/sto
            # change at each iteration:
            if "~" in opt:
                f = lambda b, p=sta, q=sto: b[p:q]
            elif "#" in opt:
                f = lambda b, p=sta, q=sto, x=go: str(b[p:q])[::x]
            else:
                f = lambda b, p=sta, q=sto: b[p:q].ival
            D[symbol] = f
        if count != size:
            logger.error("ispec size mismatch (%s)" % self.format)
        return ast

    # decode always receive input bytes in ascending memory order
    def decode(self, istr, endian=1, i=None, iclass=instruction):
        """Decodes the bytes of istr according to this spec.

        Arguments:
            istr: the input bytes, in ascending memory order.
            endian: 1 or -1, used as the step to order the spec's bytes.
            i: the instruction to update, or None to create a new one.
            iclass: the class of the instruction created when i is None.

        Returns:
            the instruction, once updated by the hook.

        Raises:
            DecodeError: if istr is too short or does not match the fixed bits.
            InstructionError: if the precondition is not met or if the hook
                raises InstructionError. In that case the bytes and the
                attributes added to the instruction by this call are removed.
        """
        # check spec :
        blen = self.fix.size // 8
        if len(istr) < blen:
            raise DecodeError
        bs = istr[0:blen]
        # Bits object created with LSB to MSB byte string:
        ival = bs[::endian]
        b = Bits(ival, self.fix.size, bitorder=1)
        if b & self.mask != self.fix:
            raise DecodeError
        if self.size == 0:  # variable length spec:
            if endian != 1:
                logger.error("invalid endianess")
            # the remaining input bytes are made available to the extractors
            # (presumably '//' concatenates Bits -- see crysp.bits)
            b = b // Bits(istr[blen:], bitorder=1)
        # create & update instruction object:
        if i is None:
            i = iclass(bs)
        else:
            i.bytes += bs
        i.spec = self
        # set instruction attributes from directives, and then
        # call hook function with instruction as first parameter
        # and fargs (note that hook can thus overwrite previous attributes)
        for k, v in self.iattr.items():
            if isinstance(v, FunctionType):
                v = v(b)
            setattr(i, k, v)
        kargs = {}
        for k, v in self.fargs.items():
            if isinstance(v, FunctionType):
                v = v(b)
            kargs[k] = v
        # and finally call the hook:
        try:
            # check any precondition on i:
            if self.precond and (not self.precond(i)):
                raise InstructionError(i)
            # ok, lets call the spec hook...
            self.hook(obj=i, **kargs)
        except InstructionError:
            # clean up:
            i.bytes = i.bytes[: -len(bs)]
            for k in self.iattr.keys():
                delattr(i, k)
            raise InstructionError(i)
        return i

    def encode(self, i):
        "not implemented"
        raise NotImplementedError

    # decorate:
    def __call__(self, handler):
        """Registers this spec in the module of handler and sets handler as
        its hook. The handler is returned unchanged."""
        m = inspect.getmodule(handler)
        ispec_register(self, m)
        varnames = handler.__code__.co_varnames
        fname = handler.__name__
        # every keyword passed to the hook should be one of its variable names:
        for k in self.fargs.keys():
            if k not in varnames:
                logger.error("ispec symbol not found in decorated function %s" % fname)
        self.hook = handler
        return handler
# -------------------------------------------------
# ispec format parser:
# ---------------------
# positive decimal integer (no leading zero):
integer = pp.Regex(r"[1-9][0-9]*")
# spec direction:
indxdir = pp.oneOf(["<", ">"])
# fixed bit value:
fixbit = pp.oneOf(["0", "1"])
# a number is an integer or a single 0/1 digit, and is converted to int:
number = integer | fixbit
number.setParseAction(lambda r: int(r[0]))
# '*' stands for a variable length:
unklen = pp.Literal("*")
length = number | unklen
# bit that is not decoded:
unkbit = pp.oneOf(["-"])
# {hh} fixed byte, converted to a Bits object of size 8:
fixbyte = pp.Regex(r"{[0-9a-fA-F][0-9a-fA-F]}").setParseAction(
    lambda r: Bits(int(r[0][1:3], 16), 8)
)
fixed = fixbyte | fixbit | unkbit
# directive: [option] symbol [(length)], parsed as a group [option, symbol, length]
# where option defaults to "" and length defaults to 1:
option = pp.oneOf([".", "~", "#", "="])
symbol = pp.Regex(r"[A-Za-z_][A-Za-z0-9_]*")
location = pp.Suppress("(") + length + pp.Suppress(")")
directive = pp.Group(
    pp.Optional(option, default="") + symbol + pp.Optional(location, default=1)
)
# spec length and direction, parsed as a group [length, direction]
# where direction defaults to "<":
speclen = pp.Group(length + pp.Optional(indxdir, default="<"))
# bracketed list of directives and fixed bits/bytes:
specformat = pp.Group(
    pp.Suppress("[") + pp.OneOrMore(directive | fixed) + pp.Suppress("]")
)
# optional '+' flag (True if present, False otherwise):
specoption = pp.Optional(pp.Literal("+").setParseAction(lambda r: True), default=False)
# optional '&' number (0 if absent):
specmore = pp.Optional(pp.Suppress("&") + number, default=0)
# complete spec: the parse result holds the 4 items above, in that order
specdecode = speclen + specformat + specoption + specmore
def ispec_register(x, module):
    """Appends the ispec object x to the ISPECS list of module, unless the
    mask of x is equal to 0.

    Raises:
        AttributeError: if module has no ISPECS attribute.
    """
    try:
        registry = module.ISPECS
    except AttributeError:
        logger.error("spec modules must declare ISPECS=[] before @ispec decorators")
        raise AttributeError
    # NOTE(review): the list of known patterns is local to the call and thus
    # always empty when checked, so the conflict branch below is never taken.
    # Making it effective would change which specs are registered -- confirm
    # the intent before touching it.
    known = []
    pattern = x.fixed()
    if pattern not in known:
        if x.mask != 0:
            registry.append(x)
            known.append(pattern)
    else:
        logger.error(
            "ispec conflict for %s (vs. %s)"
            % (x.format, registry[known.index(pattern)].format)
        )
def test_parser():
    """Reads one ispec format from the standard input, builds the ispec object
    (with mnemonic "TEST") and prints its parsed format.

    Returns:
        the ispec object, or None if the end of input is reached.
    """
    # raw_input only exists in Python 2, where input() would evaluate the text
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    try:
        res = read_line("ispec>")
        s = ispec(res, mnemonic="TEST")
        print(s.ast)
        return s
    except EOFError:
        return
# when the module is run as a script, the ispec parser check is started:
if __name__ == "__main__":
    test_parser()