From 05e8df00a30d47d66b448c9776604da2dd00a27e Mon Sep 17 00:00:00 2001 From: Tristan Ravitch Date: Sun, 28 Nov 2021 23:53:22 -0800 Subject: [PATCH] Add a concrete syntax for macaw This will enable caching analysis results in a text format that can be parsed much more efficiently than re-running the entire fixed point analysis. --- base/macaw-base.cabal | 4 + base/src/Data/Macaw/Syntax/Atom.hs | 131 +++++++++ base/src/Data/Macaw/Syntax/Parser.hs | 386 +++++++++++++++++++++++++++ base/src/Data/Macaw/Syntax/SExpr.hs | 245 +++++++++++++++++ 4 files changed, 766 insertions(+) create mode 100644 base/src/Data/Macaw/Syntax/Atom.hs create mode 100644 base/src/Data/Macaw/Syntax/Parser.hs create mode 100644 base/src/Data/Macaw/Syntax/SExpr.hs diff --git a/base/macaw-base.cabal b/base/macaw-base.cabal index d6305deb..88723046 100644 --- a/base/macaw-base.cabal +++ b/base/macaw-base.cabal @@ -37,6 +37,7 @@ library galois-dwarf >= 0.2.2, IntervalMap >= 0.5, lens >= 4.7, + megaparsec >= 7 && < 10, mtl, parameterized-utils >= 2.1.0 && < 2.2, prettyprinter >= 1.7.0, @@ -78,6 +79,9 @@ library Data.Macaw.Memory.LoadCommon Data.Macaw.Memory.Permissions Data.Macaw.Memory.Symbols + Data.Macaw.Syntax.Atom + Data.Macaw.Syntax.Parser + Data.Macaw.Syntax.SExpr Data.Macaw.Types Data.Macaw.Utils.Changed Data.Macaw.Utils.IncComp diff --git a/base/src/Data/Macaw/Syntax/Atom.hs b/base/src/Data/Macaw/Syntax/Atom.hs new file mode 100644 index 00000000..64119b84 --- /dev/null +++ b/base/src/Data/Macaw/Syntax/Atom.hs @@ -0,0 +1,131 @@ +{-# LANGUAGE OverloadedStrings #-} +-- | The atoms of the concrete syntax for macaw +module Data.Macaw.Syntax.Atom ( + Keyword(..) + , keywords + , AtomName(..) + , Atom(..) + ) + where + +import qualified Data.Map.Strict as Map +import qualified Data.Text as T +import Numeric.Natural ( Natural ) + +-- | Macaw syntax keywords +-- +-- These are the keywords for the *base* macaw IR (i.e., without +-- architecture-specific extensions). The architecture-specific operations are +-- parsed as 'AtomName's initially and resolved by architecture-specific parsers +-- at the atom level. +data Keyword = BVAdd + | BVSub + | BVMul + | BVAdc + | BVSbb + | BVAnd + | BVOr + | BVXor + | BVShl + | BVShr + | BVSar + | PopCount + | Bsf + | Bsr + | BVComplement + | Mux + | Lt + | Le + | Sle + | Slt + -- Syntax + | Assign + -- Statements + | Comment + | InstructionStart + | WriteMemory + | CondWriteMemory + -- Expressions + | ReadMemory + -- Boolean operations + | Eq_ + | Not_ + | And_ + | Or_ + | Xor_ + -- Endianness + | BigEndian + | LittleEndian + -- MemRepr + | BVMemRepr + -- Types + | Bool_ + | BV_ + | Float_ + | Tuple_ + | Vec_ + -- Values + | True_ + | False_ + | BV + | Undefined + deriving (Eq, Ord, Show) + +-- | Uninterpreted atoms +newtype AtomName = AtomText T.Text + deriving (Eq, Ord, Show) + +data Atom = Keyword !Keyword -- ^ Keywords include all of the built-in expressions and operators + | AtomName !AtomName -- ^ Non-keyword syntax atoms (to be interpreted at parse time) + | Register !Natural -- ^ A numbered local register (e.g., @r12@) + | Address !Natural -- ^ An arbitrary address rendered in hex ('ArchAddrWord' or 'SegoffAddr') + | Integer_ !Integer -- ^ Literal integers + | Natural_ !Natural -- ^ Literal naturals + | String_ !T.Text -- ^ Literal strings + deriving (Eq, Ord, Show) + +keywords :: Map.Map T.Text Keyword +keywords = Map.fromList [ ("bv-add", BVAdd) + , ("bv-sub", BVSub) + , ("bv-mul", BVMul) + , ("bv-adc", BVAdc) + , ("bv-sbb", BVSbb) + , ("bv-and", BVAnd) + , ("bv-or", BVOr) + , ("bv-xor", BVXor) + , ("bv-shl", BVShl) + , ("bv-shr", BVShr) + , ("bv-sar", BVSar) + , ("bv-complement", BVComplement) + , ("popcount", PopCount) + , ("bit-scan-forward", Bsf) + , ("bit-scan-reverse", Bsr) + , ("mux", Mux) + , ("eq", Eq_) + , ("<", Lt) + , ("<=", Le) + , ("<$", Slt) + , ("<=$", Sle) + , ("not", Not_) + , ("and", And_) + , ("or", Or_) + , ("xor", Xor_) + , ("Bool", Bool_) + , ("BV", BV_) + , ("Float", Float_) + , ("Tuple", Tuple_) + , ("Vec", Vec_) + , ("true", True_) + , ("false", False_) + , ("bv", BV) + , ("undefined", Undefined) + , ("read-memory", ReadMemory) + , (":=", Assign) + , ("comment", Comment) + , ("instruction-start", InstructionStart) + , ("write-memory", WriteMemory) + , ("cond-write-memory", CondWriteMemory) + , ("big-endian", BigEndian) + , ("little-endian", LittleEndian) + , ("bv-mem", BVMemRepr) + ] diff --git a/base/src/Data/Macaw/Syntax/Parser.hs b/base/src/Data/Macaw/Syntax/Parser.hs new file mode 100644 index 00000000..6129a248 --- /dev/null +++ b/base/src/Data/Macaw/Syntax/Parser.hs @@ -0,0 +1,386 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE TypeOperators #-} +{-# LANGUAGE ViewPatterns #-} +-- | This module defines a concrete syntax parser for macaw, which is meant to +-- make it easy to persist analysis results and reuse them without recomputing +-- everything +-- +-- The persisted artifact is a zip file with one file per function, plus a +-- manifest file that records metadata about the binary to enable the loader to +-- check to ensure that the loaded macaw IR actually matches the given binary. +module Data.Macaw.Syntax.Parser ( + parseDiscoveryFunInfo + ) where + +import Control.Applicative ( (<|>) ) +import qualified Data.ByteString as BS +import qualified Data.Map as Map +import qualified Data.Parameterized.Classes as PC +import qualified Data.Parameterized.NatRepr as PN +import Data.Parameterized.Some ( Some(..) ) +import qualified Data.Text as T +import GHC.TypeLits ( type (<=) ) +import Numeric.Natural ( Natural ) +import qualified Text.Megaparsec as TM +import qualified Text.Megaparsec.Char as TMC +import qualified Text.Megaparsec.Char.Lexer as TMCL + +import qualified Data.Macaw.CFG as MC +import qualified Data.Macaw.Discovery as MD +import qualified Data.Macaw.Discovery.State as MDS +import qualified Data.Macaw.Discovery.ParsedContents as Parsed +import qualified Data.Macaw.Memory as MM +import Data.Macaw.Syntax.Atom +import qualified Data.Macaw.Syntax.SExpr as SExpr +import qualified Data.Macaw.Types as MT + +-- | Identifiers are like scheme identifiers (i.e., start with an alphabetic +-- character, can contain alphanumeric characters plus dashes and underscores) +parseIdentifier :: SExpr.Parser arch ids T.Text +parseIdentifier = do + c1 <- TMC.letterChar + cs <- TM.many (TM.try TMC.alphaNumChar <|> TM.try (TMC.char '-') <|> TMC.char '_') + return (T.pack (c1 : cs)) + +parseKeywordOrAtom :: SExpr.Parser arch ids Atom +parseKeywordOrAtom = do + x <- parseIdentifier + return $! maybe (AtomName (AtomText x)) Keyword (Map.lookup x keywords) + +parseAddress :: SExpr.Parser arch ids Atom +parseAddress = Address <$> (TMC.string "0x" >> TMCL.hexadecimal) + +parseNatural :: SExpr.Parser arch ids Atom +parseNatural = Natural_ <$> TMCL.decimal + +parseRegister :: SExpr.Parser arch ids Atom +parseRegister = Register <$> (TMC.char 'r' >> TMCL.decimal) + +parseInteger :: SExpr.Parser arch ids Atom +parseInteger = TM.try (Integer_ <$> (TMC.char '+' >> TMCL.decimal)) + <|> (Integer_ <$> (TMC.char '-' >> TMCL.decimal)) + +parseString :: SExpr.Parser arch ids Atom +parseString = (String_ . T.pack) <$> (TMC.char '"' >> TM.manyTill TMCL.charLiteral (TMC.char '"')) + +-- | Parse a single 'Atom' +-- +-- Note that the order of these parsers matters a lot. We have to parse +-- registers before general atoms, as they overlap +parseAtom :: SExpr.Parser arch ids Atom +parseAtom = TM.try parseRegister + <|> TM.try parseKeywordOrAtom + <|> TM.try parseNatural + <|> TM.try parseInteger + <|> TM.try parseAddress + <|> parseString + +parse + :: (SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError a) + -> SExpr.Parser arch ids a +parse asSomething = do + at <- SExpr.sexp parseAtom + case asSomething at of + Left err -> TM.customFailure err + Right r -> return r + +parseSExpr + :: forall arch ids a + . SExpr.Syntax Atom + -> (SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError a) + -> SExpr.Parser arch ids a +parseSExpr sexp asSomething = + case asSomething sexp of + Left err -> TM.customFailure err + Right r -> return r + +data WidthRepr where + WidthRepr :: (1 <= n) => PN.NatRepr n -> WidthRepr + +-- | Attempt to convert a 'Natural' into a non-zero 'PN.NatRepr' +asWidthRepr :: Natural -> Maybe WidthRepr +asWidthRepr nat = + case PN.mkNatRepr nat of + Some nr + | Right PN.LeqProof <- PN.isZeroOrGT1 nr -> Just (WidthRepr nr) + | otherwise -> Nothing + +asEndianness :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError MM.Endianness +asEndianness at = + case at of + SExpr.A (Keyword BigEndian) -> Right MM.BigEndian + SExpr.A (Keyword LittleEndian) -> Right MM.LittleEndian + _ -> Left (SExpr.InvalidEndianness at) + +asMemRepr :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (SomeTyped MC.MemRepr) +asMemRepr at = + case at of + SExpr.L [ SExpr.A (Keyword BVMemRepr), SExpr.A (Natural_ w), send ] + | Just (WidthRepr nr) <- asWidthRepr w -> do + end <- asEndianness send + let nr8 = PN.natMultiply (PN.knownNat @8) nr + -- This cannot fail because we already checked that @w@ is >= 1 above + case PN.isZeroOrGT1 nr8 of + Left _ -> error ("Impossible; w was >= 1, so w * 8 must be >= 1: " ++ show nr8) + Right PN.LeqProof -> do + let tyRep = MT.BVTypeRepr nr8 + return (SomeTyped tyRep (MC.BVMemRepr nr end)) + +data SomeTyped f where + SomeTyped :: MT.TypeRepr tp -> f tp -> SomeTyped f + +-- | Note: this does not yet handle relocatable values +asValue :: SExpr.Syntax Atom -> SExpr.Parser arch ids (SomeTyped (MC.Value arch ids)) +asValue at = do + as <- SExpr.getArchSyntax + case at of + SExpr.A (Keyword True_) -> return (SomeTyped MT.BoolTypeRepr (MC.BoolValue True)) + SExpr.A (Keyword False_) -> return (SomeTyped MT.BoolTypeRepr (MC.BoolValue False)) + SExpr.L [SExpr.A (Keyword BV), SExpr.A (Natural_ w), SExpr.A (Integer_ i)] + | Just (WidthRepr nr) <- asWidthRepr w -> return (SomeTyped (MT.BVTypeRepr nr) (MC.BVValue nr i)) + | otherwise -> TM.customFailure SExpr.InvalidZeroWidthBV + SExpr.A (Register rnum) -> do + -- This generates an 'MC.AssignedValue' + Some val <- SExpr.valueForRegisterNumber rnum + return (SomeTyped (MT.typeRepr val) val) + SExpr.A (AtomName aname) + | Just (Some reg) <- SExpr.asArchRegister as aname -> return (SomeTyped (MT.typeRepr reg) (MC.Initial reg)) + _ -> TM.customFailure (SExpr.InvalidValue at) + +binaryBVApp + :: Keyword + -> (forall n . (1 <= n) => PN.NatRepr n -> MC.Value arch ids (MT.BVType n) -> MC.Value arch ids (MT.BVType n) -> MC.App (MC.Value arch ids) (MT.BVType n)) + -> SExpr.Syntax Atom + -> SExpr.Syntax Atom + -> SExpr.Parser arch ids (SomeTyped (MC.App (MC.Value arch ids))) +binaryBVApp kw con lhs rhs = do + SomeTyped lrep lhs' <- asValue lhs + SomeTyped rrep rhs' <- asValue rhs + case (PC.testEquality lrep rrep, lrep) of + (Just PC.Refl, MT.BVTypeRepr nr) -> return (SomeTyped lrep (con nr lhs' rhs')) + _ -> TM.customFailure (SExpr.InvalidAppArguments kw) + +unaryBVApp + :: Keyword + -> (forall n . (1 <= n) => PN.NatRepr n -> MC.Value arch ids (MT.BVType n) -> MC.App (MC.Value arch ids) (MT.BVType n)) + -> SExpr.Syntax Atom + -> SExpr.Parser arch ids (SomeTyped (MC.App (MC.Value arch ids))) +unaryBVApp kw con op = do + SomeTyped rep op' <- asValue op + case rep of + MT.BVTypeRepr nr -> return (SomeTyped rep (con nr op')) + _ -> TM.customFailure (SExpr.InvalidAppArguments kw) + +binaryBoolApp + :: Keyword + -> MT.TypeRepr tp + -> (MC.Value arch ids tp -> MC.Value arch ids tp -> MC.App (MC.Value arch ids) tp) + -> SExpr.Syntax Atom + -> SExpr.Syntax Atom + -> SExpr.Parser arch ids (SomeTyped (MC.App (MC.Value arch ids))) +binaryBoolApp kw tp con lhs rhs = do + SomeTyped lrep lhs' <- asValue lhs + SomeTyped rrep rhs' <- asValue rhs + case (PC.testEquality tp lrep, PC.testEquality tp rrep) of + (Just PC.Refl, Just PC.Refl) -> return (SomeTyped tp (con lhs' rhs')) + _ -> TM.customFailure (SExpr.InvalidAppArguments kw) + +asApp :: SExpr.Syntax Atom -> SExpr.Parser arch ids (SomeTyped (MC.App (MC.Value arch ids))) +asApp a = + case a of + SExpr.L [ SExpr.A (Keyword BVAdd), lhs, rhs ] -> + binaryBVApp BVAdd MC.BVAdd lhs rhs + SExpr.L [ SExpr.A (Keyword BVSub), lhs, rhs ] -> + binaryBVApp BVSub MC.BVSub lhs rhs + SExpr.L [ SExpr.A (Keyword BVMul), lhs, rhs ] -> + binaryBVApp BVMul MC.BVMul lhs rhs + -- SExpr.L [ SExpr.A (Keyword BVAdc), lhs, rhs ] -> + -- binaryBVApp BVAdc MC.BVAdc lhs rhs + -- SExpr.L [ SExpr.A (Keyword BVSbb), lhs, rhs ] -> + -- binaryBVApp BVSbb MC.BVSbb lhs rhs + SExpr.L [ SExpr.A (Keyword BVAnd), lhs, rhs ] -> + binaryBVApp BVAnd MC.BVAnd lhs rhs + SExpr.L [ SExpr.A (Keyword BVOr), lhs, rhs ] -> + binaryBVApp BVOr MC.BVOr lhs rhs + SExpr.L [ SExpr.A (Keyword BVXor), lhs, rhs ] -> + binaryBVApp BVXor MC.BVXor lhs rhs + SExpr.L [ SExpr.A (Keyword BVShl), lhs, rhs ] -> + binaryBVApp BVShl MC.BVShl lhs rhs + SExpr.L [ SExpr.A (Keyword BVShr), lhs, rhs ] -> + binaryBVApp BVShr MC.BVShr lhs rhs + SExpr.L [ SExpr.A (Keyword BVSar), lhs, rhs ] -> + binaryBVApp BVSar MC.BVSar lhs rhs + SExpr.L [ SExpr.A (Keyword PopCount), op ] -> + unaryBVApp PopCount MC.PopCount op + SExpr.L [ SExpr.A (Keyword Bsf), op ] -> + unaryBVApp Bsf MC.Bsf op + SExpr.L [ SExpr.A (Keyword Bsr), op ] -> + unaryBVApp Bsr MC.Bsr op + SExpr.L [ SExpr.A (Keyword BVComplement), op ] -> + unaryBVApp BVComplement MC.BVComplement op + SExpr.L [ SExpr.A (Keyword Mux), cond, t, f ] -> do + SomeTyped crep cond' <- asValue cond + SomeTyped trep t' <- asValue t + SomeTyped frep f' <- asValue f + case (PC.testEquality trep frep, crep) of + (Just PC.Refl, MT.BoolTypeRepr) -> return (SomeTyped trep (MC.Mux trep cond' t' f')) + _ -> TM.customFailure (SExpr.InvalidAppArguments Mux) + SExpr.L [ SExpr.A (Keyword Eq_), lhs, rhs ] -> do + SomeTyped lrep lhs' <- asValue lhs + SomeTyped rrep rhs' <- asValue rhs + case PC.testEquality lrep rrep of + Just PC.Refl -> return (SomeTyped MT.BoolTypeRepr (MC.Eq lhs' rhs')) + _ -> TM.customFailure (SExpr.InvalidAppArguments Eq_) + SExpr.L [ SExpr.A (Keyword And_), lhs, rhs ] -> + binaryBoolApp And_ MT.BoolTypeRepr MC.AndApp lhs rhs + SExpr.L [ SExpr.A (Keyword Or_), lhs, rhs ] -> + binaryBoolApp Or_ MT.BoolTypeRepr MC.OrApp lhs rhs + SExpr.L [ SExpr.A (Keyword Xor_), lhs, rhs ] -> + binaryBoolApp Xor_ MT.BoolTypeRepr MC.XorApp lhs rhs + SExpr.L [ SExpr.A (Keyword Not_), op ] -> do + SomeTyped rep op' <- asValue op + case rep of + MT.BoolTypeRepr -> return (SomeTyped MT.BoolTypeRepr (MC.NotApp op')) + _ -> TM.customFailure (SExpr.InvalidAppArguments Not_) + + +-- | Parse a single type as a 'MT.TypeRepr' +-- +-- The type forms are: +-- +-- * @Bool@ +-- * @(BV n)@ +-- * @(Vec n ty)@ +asTypeRepr :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (Some MT.TypeRepr) +asTypeRepr at = + case at of + SExpr.A (Keyword Bool_) -> Right (Some MT.BoolTypeRepr) + SExpr.L [SExpr.A (Keyword BV_), SExpr.A (Natural_ w)] + | Just (WidthRepr nr) <- asWidthRepr w -> Right (Some (MT.BVTypeRepr nr)) + | otherwise -> Left SExpr.InvalidZeroWidthBV + SExpr.L [SExpr.A (Keyword Vec_), SExpr.A (Natural_ w), mty] -> + case PN.mkNatRepr w of + Some nr -> + -- Note that zero-width vectors are technically allowed + case asTypeRepr mty of + Right (Some ty) -> Right (Some (MT.VecTypeRepr nr ty)) + Left _errs -> Left (SExpr.InvalidVectorPayload mty) + _ -> Left (SExpr.InvalidType at) + +-- | Parse the right-hand side of an assignment +-- +-- Note that it is important that the EvalApp case is last, as there are many +-- syntactic forms that we might need to dispatch to. +asRHS + :: forall arch ids + . SExpr.Syntax Atom + -> SExpr.Parser arch ids (Some (MC.AssignRhs arch (MC.Value arch ids))) +asRHS a = + case a of + SExpr.L [ SExpr.A (Keyword Undefined), srep ] -> do + Some rep <- parseSExpr srep asTypeRepr + return (Some (MC.SetUndefined rep)) + SExpr.L [ SExpr.A (Keyword ReadMemory), saddr, smemRepr ] -> do + SomeTyped addrTp addr <- asValue saddr + SomeTyped _rep memRepr <- parseSExpr smemRepr asMemRepr + + memWidth <- SExpr.archMemWidth + let addrRepr = MT.BVTypeRepr memWidth + + case PC.testEquality addrTp addrRepr of + Just PC.Refl -> return (Some (MC.ReadMem addr memRepr)) + Nothing -> TM.customFailure (SExpr.InvalidAddressWidth a) + _ -> do + SomeTyped _tp app <- asApp a + return (Some (MC.EvalApp app)) + +-- | Forms: +-- +-- * @(comment "msg")@ +-- * @(instruction-start addr decoded-asm-text)@ +-- * @(write-memory addr mem-rep value)@ +-- * @(cond-write-memory cond addr mem-rep value)@ +-- * @(reg := rhs)@ +parseStmt :: forall arch ids . SExpr.Parser arch ids (MC.Stmt arch ids) +parseStmt = do + at <- SExpr.sexp parseAtom + case at of + SExpr.L [ SExpr.A (Keyword Comment), SExpr.A (String_ txt) ] -> + return (MC.Comment txt) + SExpr.L [ SExpr.A (Keyword InstructionStart), SExpr.A (Address addr), SExpr.A (String_ txt) ] -> + return (MC.InstructionStart (MM.memWord (fromIntegral addr)) txt) + SExpr.L [ SExpr.A (Keyword WriteMemory), stargetAddr, smemRepr, svalue ] -> do + SomeTyped addrTy addr <- asValue stargetAddr + SomeTyped vtp value <- asValue svalue + SomeTyped mtp memRepr <- parseSExpr smemRepr asMemRepr + memWidth <- SExpr.archMemWidth + let addrRepr = MT.BVTypeRepr memWidth + case (PC.testEquality vtp mtp, PC.testEquality addrTy addrRepr) of + (Just PC.Refl, Just PC.Refl) -> do + return (MC.WriteMem addr memRepr value) + -- FIXME: Make a more-specific error + _ -> TM.customFailure (SExpr.InvalidStatement at) + SExpr.L [ SExpr.A (Keyword CondWriteMemory), scond, stargetAddr, smemRepr, svalue ] -> do + SomeTyped condTy cond <- asValue scond + SomeTyped addrTy addr <- asValue stargetAddr + SomeTyped vtp value <- asValue svalue + SomeTyped mtp memRepr <- parseSExpr smemRepr asMemRepr + memWidth <- SExpr.archMemWidth + let addrRepr = MT.BVTypeRepr memWidth + case (PC.testEquality vtp mtp, PC.testEquality addrTy addrRepr, PC.testEquality condTy MT.BoolTypeRepr) of + (Just PC.Refl, Just PC.Refl, Just PC.Refl) -> do + return (MC.CondWriteMem cond addr memRepr value) + -- FIXME: Make a more-specific error + _ -> TM.customFailure (SExpr.InvalidStatement at) + SExpr.L [ SExpr.A (Register r), SExpr.A (Keyword Assign), srhs ] -> do + Some rhs <- asRHS srhs + Some asgn <- SExpr.defineRegister r rhs + return (MC.AssignStmt asgn) + _ -> TM.customFailure (SExpr.InvalidStatement at) + +parseBlock :: SExpr.Parser arch ids (Parsed.ParsedBlock arch ids) +parseBlock = do + + stmts <- TM.many parseStmt + + return Parsed.ParsedBlock { Parsed.pblockAddr = undefined + , Parsed.pblockPrecond = undefined + , Parsed.blockSize = undefined + , Parsed.blockReason = undefined + , Parsed.blockAbstractState = undefined + , Parsed.blockJumpBounds = undefined + , Parsed.pblockStmts = stmts + , Parsed.pblockTermStmt = undefined + } + +parseFunction :: SExpr.Parser arch ids (Some (MD.DiscoveryFunInfo arch)) +parseFunction = do + + blocks <- TM.many parseBlock + + let dfi = MDS.DiscoveryFunInfo { MDS.discoveredFunReason = undefined + , MDS.discoveredFunAddr = undefined + , MDS.discoveredFunSymbol = undefined + , MDS._parsedBlocks = Map.fromList [ (Parsed.pblockAddr pb, pb) + | pb <- blocks + ] + , MDS.discoveredClassifyFailureResolutions = undefined + } + return (Some dfi) + +parseDiscoveryFunInfo + :: (MC.ArchConstraints arch) + => SExpr.ArchSyntax arch + -- ^ Architecture-specific parsers + -> MM.Memory (MC.ArchAddrWidth arch) + -- ^ The memory of the binary we are loading results for + -> BS.ByteString + -- ^ The bytes of the persisted file + -> Either SExpr.MacawParseError (Some (MD.DiscoveryFunInfo arch)) +parseDiscoveryFunInfo as mem bytes = SExpr.runParser as mem bytes parseFunction diff --git a/base/src/Data/Macaw/Syntax/SExpr.hs b/base/src/Data/Macaw/Syntax/SExpr.hs new file mode 100644 index 00000000..9147c869 --- /dev/null +++ b/base/src/Data/Macaw/Syntax/SExpr.hs @@ -0,0 +1,245 @@ +{-# LANGUAGE DataKinds #-} +{-# LANGUAGE DeriveFunctor #-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FunctionalDependencies #-} +{-# LANGUAGE GADTs #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE KindSignatures #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE RankNTypes #-} +{-# LANGUAGE StandaloneDeriving #-} +{-# LANGUAGE TypeApplications #-} +{-# LANGUAGE ViewPatterns #-} +module Data.Macaw.Syntax.SExpr ( + Position(..) + , Positioned(..) + , Syntax(..) + , Datum(..) + , Layer(..) + , Parser + , runParser + , freshNonce + , MacawParseError(..) + , MacawSyntaxError(..) + , memory + , archMemWidth + , valueForRegisterNumber + , defineRegister + , ArchSyntax(..) + , getArchSyntax + , Syntactic(..) + , pattern L + , pattern A + , sexp + , lexeme + , symbol + , syntaxPos + , withPosFrom + ) where + +import Control.Applicative ( empty, (<|>), Alternative ) +import Control.Monad ( MonadPlus ) +import qualified Control.Monad.RWS as RWS +import qualified Control.Monad.Reader as CMR +import Control.Monad.ST ( ST ) +import qualified Control.Monad.State as CMS +import Control.Monad.Trans ( lift ) +import qualified Data.ByteString as BS +import qualified Data.Map as Map +import qualified Data.Parameterized.NatRepr as PN +import qualified Data.Parameterized.Nonce as PN +import Data.Parameterized.Some ( Some(..) ) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.Text.Encoding.Error as TEE +import Numeric.Natural ( Natural ) +import qualified Text.Megaparsec as TM +import qualified Text.Megaparsec.Char as TMC +import qualified Text.Megaparsec.Char.Lexer as TMCL + +import qualified Data.Macaw.CFG as MC +import qualified Data.Macaw.Memory as MM +import Data.Macaw.Syntax.Atom ( Atom, AtomName, Keyword ) +import qualified Data.Macaw.Types as MT + +data Position = Position { file :: FilePath, line :: {-# UNPACK #-} !TM.Pos, column :: {-# UNPACK #-} !TM.Pos } + deriving (Eq, Ord, Show) + +data Positioned a = Positioned { position :: {-# UNPACK #-} !Position, positioned :: !a } + deriving (Eq, Functor, Ord, Show) + +data Layer f a = List [f a] | Atom !a + deriving (Show, Functor, Eq, Ord) + +newtype Syntax a = Syntax { unSyntax :: Positioned (Layer Syntax a) } + deriving (Show, Functor, Eq, Ord) + +newtype Datum a = Datum { unDatum :: Layer Datum a } + deriving (Show, Functor, Eq) + +class Syntactic a b | a -> b where + syntaxE :: a -> Layer Syntax b + +instance Syntactic (Layer Syntax a) a where + syntaxE = id + +instance Syntactic (Syntax a) a where + syntaxE = positioned . unSyntax + +pattern A :: Syntactic a b => b -> a +pattern A x <- (syntaxE -> Atom x) + +pattern L :: Syntactic a b => [Syntax b] -> a +pattern L xs <- (syntaxE -> List xs) + +syntaxPos :: Syntax a -> Position +syntaxPos = position . unSyntax + +withPosFrom :: Syntax a -> b -> Positioned b +withPosFrom stx x = Positioned (syntaxPos stx) x + +-- | Low-level syntax errors embedded in megaparsec parse errors +data MacawSyntaxError where + InvalidZeroWidthBV :: MacawSyntaxError + InvalidVectorPayload :: Syntax Atom -> MacawSyntaxError + InvalidType :: Syntax Atom -> MacawSyntaxError + InvalidStatement :: Syntax Atom -> MacawSyntaxError + InvalidEndianness :: Syntax Atom -> MacawSyntaxError + MissingRegisterDefinition :: Natural -> MacawSyntaxError + DuplicateRegisterDefinition :: Natural -> MacawSyntaxError + InvalidAddressWidth :: Syntax Atom -> MacawSyntaxError + InvalidValue :: Syntax Atom -> MacawSyntaxError + InvalidAppArguments :: Keyword -> MacawSyntaxError + +deriving instance Eq MacawSyntaxError +deriving instance Ord MacawSyntaxError + +data ArchSyntax arch = + ArchSyntax { asArchRegister :: AtomName -> Maybe (Some (MC.ArchReg arch)) + } + +data ParserContext arch ids = + ParserContext { memory :: MM.Memory (MC.ArchAddrWidth arch) + , nonceGen :: PN.NonceGenerator (ST ids) ids + , archSyntax :: ArchSyntax arch + } + +data ParserState arch ids = + ParserState { idMap :: Map.Map Natural (Some (MC.Assignment arch ids)) + } + +newtype ParserM arch ids a = ParserM { runParserM :: TM.ParsecT MacawSyntaxError T.Text (RWS.RWST (ParserContext arch ids) () (ParserState arch ids) (ST ids)) a } + deriving ( Functor + , Applicative + , Alternative + , Monad + , MonadPlus + , TM.MonadParsec MacawSyntaxError T.Text + , CMR.MonadReader (ParserContext arch ids) + , CMS.MonadState (ParserState arch ids) + ) + +type Parser arch ids a = (MC.ArchConstraints arch) => ParserM arch ids a + +liftST :: ST ids a -> Parser arch ids a +liftST = ParserM . lift . lift + +freshNonce :: forall arch ids (tp :: MT.Type) . Parser arch ids (PN.Nonce ids tp) +freshNonce = do + ng <- CMR.asks nonceGen + liftST $ PN.freshNonce ng + +getArchSyntax :: Parser arch ids (ArchSyntax arch) +getArchSyntax = CMR.asks archSyntax + +-- | Look up the value corresponding to a register number in the current +-- function; note that this assumes that definitions precede uses (and throws a +-- parse error if that is not true) +valueForRegisterNumber :: Natural -> Parser arch ids (Some (MC.Value arch ids)) +valueForRegisterNumber rnum = do + ids <- CMS.gets idMap + case Map.lookup rnum ids of + Just (Some asgn) -> return (Some (MC.AssignedValue asgn)) + Nothing -> TM.customFailure (MissingRegisterDefinition rnum) + +-- | Define a new register to a value +-- +-- This corresponds to the 'MC.AssignStmt' form +defineRegister + :: forall arch ids (tp :: MT.Type) + . Natural + -> MC.AssignRhs arch (MC.Value arch ids) tp + -> Parser arch ids (Some (MC.Assignment arch ids)) +defineRegister rnum rhs = do + ids <- CMS.gets idMap + case Map.lookup rnum ids of + Just _ -> TM.customFailure (DuplicateRegisterDefinition rnum) + Nothing -> do + rnonce <- freshNonce @arch @ids @tp + let asgn = MC.Assignment (MC.AssignId rnonce) rhs + CMS.modify' $ \s -> s { idMap = Map.insert rnum (Some asgn) (idMap s) } + return (Some asgn) + +archMemWidth :: Parser arch ids (PN.NatRepr (MC.ArchAddrWidth arch)) +archMemWidth = CMR.asks (MM.memWidth . memory) + +-- | Top-level parse errors returned by the parser +-- +-- This includes megaparsec parse errors (see 'MacawSyntaxError') +data MacawParseError where + InvalidUTF8 :: TEE.UnicodeException -> BS.ByteString -> MacawParseError + ParseError :: TM.ParseErrorBundle T.Text MacawSyntaxError -> MacawParseError + +runParser + :: ArchSyntax arch + -> MM.Memory (MC.ArchAddrWidth arch) + -> BS.ByteString + -> (forall ids . ParserM arch ids a) + -> Either MacawParseError a +runParser as mem bytes p = PN.runSTNonceGenerator $ \ng -> do + case TE.decodeUtf8' bytes of + Left decodeErr -> return (Left (InvalidUTF8 decodeErr bytes)) + Right txt -> do + let ctx = ParserContext { memory = mem + , nonceGen = ng + , archSyntax = as + } + let st0 = ParserState { idMap = mempty } + (res, _) <- RWS.evalRWST (TM.runParserT (runParserM p) "" txt) ctx st0 + case res of + Left err -> return (Left (ParseError err)) + Right a -> return (Right a) + +-- | There are no comments supported in the macaw concrete syntax for now +skipWhitespace :: Parser arch ids () +skipWhitespace = TMCL.space TMC.space1 empty empty + +lexeme :: Parser arch ids a -> Parser arch ids a +lexeme = TMCL.lexeme skipWhitespace + +withPos :: Parser arch ids a -> Parser arch ids (Positioned a) +withPos p = do + TM.SourcePos fl ln col <- TM.getSourcePos + let loc = Position fl ln col + res <- p + return $! Positioned { position = loc + , positioned = res + } + +symbol :: T.Text -> Parser arch ids T.Text +symbol = TMCL.symbol skipWhitespace + +list :: Parser arch ids (Syntax a) -> Parser arch ids (Syntax a) +list p = do + Positioned loc _ <- withPos (symbol "(") + xs <- TM.many p + _ <- lexeme (symbol ")") + return $! Syntax (Positioned loc (List xs)) + +sexp :: Parser arch ids a -> Parser arch ids (Syntax a) +sexp atom = + (Syntax . fmap Atom <$> lexeme (withPos atom)) <|> + list (sexp atom)