I don’t really enjoy writing parsers. Moreover, parsers and pretty-printing are not exactly related to working through Paulson’s paper, but one way or another, I’ll have to do it to get a working version of the theorem prover. So let’s get started, the time has come.
Refactoring Link to heading
Before working on lexing, parsing and pretty-printing, let’s refactor our code to introduce more explicit types for logical connectives and quantifiers. We’ll start by defining simple sum-types for connective
and quantifier
:
type connective = Conj | Disj | Impl | Iff | Not [@@deriving eq, show]
type quantifier = Forall | Exists [@@deriving eq, show]
Then we want to update the formula type to make use of these new definitions:
(* formula.ml *)
(* ... *)
type t =
| Pred of string * Term.t list
| Conn of connective * t list
| Quant of quantifier * string * t
(* ... *)
And update our tests to use the new types. Here’s a single test as an example:
(* formula.ml *)
(* ... *)
let%test "abstract with nested formula" =
let term = Term.Var "x" in
let formula =
Conn
( Conj, (* instead of "&" *)
[
Pred ("P", [ Term.Var "x" ]);
Quant (Forall, "y", Pred ("Q", [ Term.Var "x" ])); (* instead of "forall" *)
] )
in
let expected =
Conn
( Conj,
[
Pred ("P", [ Term.Bound 0 ]);
Quant (Forall, "y", Pred ("Q", [ Term.Bound 1 ]));
] )
in
expected = abstract term formula
(* ... *)
We can now begin working on parsing formulas.
Parsing Link to heading
We’ll focus on smth that can parse a single formula. By the end of this section, we should have a working example of parsing formulas with quantifiers, connectives, and terms.
Before we can parse the input, we need to tokenize it. The lexer is responsible for reading an input stream and converting it into a series of tokens. Here, we’ll define various tokens that represent different elements of our formula language, such as connectives (AND
, OR
, etc.), quantifiers (\(\forall\), \(\exists\)), and other syntactic elements like parentheses and commas.
(* lexer.mll *)
{
open Lexing
open Parser
exception LexingError of string
}
(* Regular expressions *)
let whitespaces = [' ' '\t']+
let newline = '\r' | '\n' | "\r\n"
let digit = ['0'-'9']
let uint = digit+
let string = ['a'-'z' 'A'-'Z']+
(* Lexing rules / Productions *)
rule read = parse
| whitespaces { read lexbuf }
| newline { new_line lexbuf; read lexbuf }
| "(" { LPAREN }
| ")" { RPAREN }
| "." { DOT }
| "," { COMMA }
| "?" { QUESTION }
| "|-" { TURNSTYLE }
| "∧" | "&" { CONJ }
| "∨" | "|" { DISJ }
| "→" | "-->" { IMPL }
| "↔" | "<->" { IFF }
| "¬" | "~" { NOT }
| "∀" | "FORALL" { FORALL }
| "∃" | "EXISTS" { EXISTS }
| uint as ix { IX(int_of_string ix) }
| string as str { STRING(str) }
| _ { raise (LexingError ("Unexpected character: " ^ Lexing.lexeme lexbuf)) }
| eof { EOF }
The regular expressions match parts of the input string (e.g., digits for indices, strings for variable names), and the lexer generates corresponding tokens.
Once we have the tokens, the next step is to parse them into a structured representation. Here is a simple parser that can handle formulas with quantifiers and logical connectives:
(* parser.mly *)
%token <int> IX
%token <string> STRING "string"
%token LPAREN "("
%token RPAREN ")"
%token DOT "."
%token COMMA ","
%token QUESTION "?"
%token TURNSTYLE "|-"
%token CONJ "∧" (* "&" *)
%token DISJ "∨" (* "|" *)
%token IMPL "→" (* "-->" *)
%token IFF "↔" (* "<->" *)
%token NOT "¬" (* "~" *)
%token FORALL "∀" (* "FORALL" *)
%token EXISTS "∃" (* "EXISTS" *)
%token EOF
%left IFF
%left IMPL
%left DISJ
%left CONJ
%right NOT
%start <Formula.t> main
%{
open Formula
open Term
%}
%%
main:
| formula EOF { $1 }
;
formula:
| q_formula { $1 }
| l_formula { $1 }
;
q_formula:
| FORALL STRING DOT formula { Quant(Forall, $2, $4) }
| EXISTS STRING DOT formula { Quant(Exists, $2, $4) }
;
l_formula:
| l_formula IMPL l_formula { Conn(Impl, [$1; $3]) }
| l_formula CONJ l_formula { Conn(Conj, [$1; $3]) }
| l_formula DISJ l_formula { Conn(Disj, [$1; $3]) }
| l_formula IFF l_formula { Conn(Iff, [$1; $3]) }
| NOT l_formula { Conn(Not, [$2]) }
| STRING "(" terms ")" { Pred($1, $3) }
| LPAREN formula RPAREN { $2 }
;
terms:
| term { [$1] }
| term COMMA terms { $1 :: $3 }
;
term:
| STRING { Var($1) }
| QUESTION STRING { Param($2, []) } (* Meta-variable *)
| IX { Bound($1) }
| STRING "(" terms ")" { Function($1, $3) }
;
main
– the starting point of the parser, which expects aformula
followed by theEOF
token.formula
– this can either be a quantified formula (q_formula
) or a logical formula (l_formula
).q_formula
– handles formulas with quantifiers (FORALL
orEXISTS
).l_formula
– deals with logical connectives as well as predicates and parenthesized subformulas.terms
– a list of terms (comma separated).term
– defines how terms are structured. A term can be a variable, a meta-variable, an indexed term, or a function application.
The TURNSTYLE
token isn’t currently in use, but let’s keep it since we’ll need it later.
To learn more about lexing and parsing in OCaml, check out Parsing with OCamllex and Menhir.
Pretty-printing Link to heading
To format formulas and terms in OCaml we’ll using the Format module, which provides tools for pretty-printing formatted text. The two primary functions for formatting terms are:
pp_term
– Formats a term using theFormat.formatter
.term_to_string
– Converts a term to its string representation.
(* term.mli *)
(* ... *)
val pp_term : Format.formatter -> t -> unit
(**
Prints the term [t] using the format output function [fmt].
@param fmt The formatter to which the formatted term is output.
@param t The term to be formatted.
*)
val term_to_string : t -> string
(**
Prints a term.
@param term The term to convert to a string.
@return A string representing the term.
*)
(* term.ml *)
(* ... *)
let rec pp_term fmt = function
| Var x -> Format.fprintf fmt "%s" x
| Param (name, vars) ->
Format.fprintf fmt "%s(%a)" name
(fun fmt vars ->
Format.open_hvbox 0;
Format.pp_print_list ~pp_sep:pp_comma Format.pp_print_string fmt vars;
Format.close_box ())
vars
| Bound ix -> Format.fprintf fmt "%d" ix
| Function (name, args) ->
Format.fprintf fmt "%s(%a)" name
(fun fmt args ->
Format.open_hvbox 2;
Format.pp_print_list ~pp_sep:pp_comma pp_term fmt args;
Format.close_box ())
args
let term_to_string term = format_to_string pp_term term
(* ... *)
Pretty-printing formulas works similarly but includes additional helpers for printing connectives and quantifiers.
(* formula.mli *)
(* ... *)
val pp_formula : Format.formatter -> t -> unit
(**
Prints a formula using the given [fmt] formatter.
@param fmt The formatter used to print the formula.
@param formula The formula to format.
*)
val formula_to_string : t -> string
(**
Prints a formula.
@param formula The formula to convert to a string.
@return A string representing the formula.
*)
Helper functions:
(* formula.ml *)
(* ... *)
let pp_connective fmt = function
| Conj -> Format.fprintf fmt "∧"
| Disj -> Format.fprintf fmt "∨"
| Impl -> Format.fprintf fmt "→"
| Iff -> Format.fprintf fmt "↔"
| Not -> Format.fprintf fmt "¬"
let pp_quantifier fmt = function
| Forall -> Format.fprintf fmt "∀"
| Exists -> Format.fprintf fmt "∃"
(* ... *)
The implementation:
(* formula.ml *)
(* ... *)
let is_pred = function Pred _ -> true | _ -> false
let rec pp_formula fmt = function
| Pred (name, terms) ->
Format.fprintf fmt "%s(%a)" name
(Format.pp_print_list ~pp_sep:pp_comma Term.pp_term)
terms
| Conn (connective, subformulas) -> (
match connective with
| Not -> pp_not fmt (List.hd_exn subformulas)
| conn ->
Format.open_vbox 0;
(Format.pp_print_list
~pp_sep:(fun fmt () ->
Format.pp_print_space fmt ();
pp_connective fmt conn;
Format.pp_print_space fmt ())
pp_formula)
fmt subformulas;
Format.close_box ())
| Quant (quantifier, var, body) ->
Format.open_hovbox 2;
Format.fprintf fmt "%a%s." pp_quantifier quantifier var;
if is_pred body then pp_formula fmt body
else Format.fprintf fmt "(%a)" (fun fmt body -> pp_formula fmt body) body;
Format.close_box ()
(* formats a negation of a formula *)
and pp_not fmt subformula =
pp_connective fmt Not;
Format.open_box 0;
pp_formula fmt subformula;
Format.close_box ()
let formula_to_string = format_to_string pp_formula
(* ... *)
The is_pred
helps determine if the body of a quantified formula is a predicate. In the code snippet above, the body of a quantified formula (Quant
) is printed in one of two ways:
- If the body is a predicate (
Pred
), we print it directly usingpp_formula fmt body
. - If the body is not a predicate, we wrap it in parentheses using
Format.fprintf fmt "(%a)"
.
The Pretty_print
module contains helper functions for the common formatting logic:
(* pretty_print.mli *)
val pp_comma : Format.formatter -> unit -> unit
(**
Print a comma, typically used for separating elements in a list or similar structures.
*)
val format_to_string : (Format.formatter -> 'a -> unit) -> 'a -> string
(**
[format_to_string pp v] formats a value [v] using the provided pretty-printing function [pp],
and returns the result as a string. This function handles buffer management and ensures that
the formatted value is properly flushed to the string.
- [pp] is a pretty-printing function that takes a [Format.formatter] and a value of type ['a] and prints it.
- [v] is the value to be formatted.
Returns the string representation of the formatted value.
*)
The implementation is nothing special:
(* pretty_print.ml *)
let pp_comma ppf () = Format.fprintf ppf ", "
let format_to_string pp v =
let open Format in
let buffer = Buffer.create 16 in
let fmt = formatter_of_buffer buffer in
pp_open_hvbox fmt 0;
pp fmt v;
pp_close_box fmt ();
pp_print_flush fmt ();
Buffer.contents buffer
I added some inline tests, which can be found in the repository.
That’s all for parsing and pretty-printing.
Now we can finally get to the fun part of Paulson’s paper – unification!