ubgpsuite/lonetix/include/df/lexer.h

// SPDX-License-Identifier: LGPL-3.0-or-later

/**
 * \file lexer.h
 *
 * C-compliant non-allocating UTF-8 text lexer.
 *
 * \author Lorenzo Cogotti
 * \copyright The DoubleFourteen Code Forge (C) All Rights Reserved
 */

#ifndef DF_LEXER_H_
#define DF_LEXER_H_

#include "utf/utfdef.h"

/// Maximum allowed token length inside text parsed by `Lex`.
/// Also used as the size of the `Tok::text` and `Lex::name` buffers.
#define MAXTOKLEN 256

// Token types, 16-bit constants matching the width of `Tok::type`.

/// String token type
#define TT_STRING  U16_C(1)
/// Literal token type
#define TT_LITERAL U16_C(2)
/// Numeric token type
#define TT_NUMBER  U16_C(3)
/// Token type for names or identifiers
#define TT_NAME    U16_C(4)
/// Punctuation token type
#define TT_PUNCT   U16_C(5)

/**
 * Token subtype flags for `TT_NUMBER`
 *
 * These are bit flags and may be combined; the highest one uses bit 20,
 * so they need a field wider than 16 bits (`Tok::subtype` is 32 bits wide).
 *
 * NOTE(review): `TT_INF` and `TT_INDEF` are documented here as `1.#INF` and
 * `1.#IND`, while `L_ALLOWFLOATEXC` mentions `0.#INF` and `0.#IND` --
 * confirm the accepted spelling against the lexer implementation.
 *
 * @{
 */

#define TT_INT         BIT(0)   ///< integer
#define TT_DEC         BIT(1)   ///< decimal number
#define TT_HEX         BIT(2)   ///< hexadecimal number
#define TT_OCT         BIT(3)   ///< octal number
#define TT_BIN         BIT(4)   ///< binary number
#define TT_LONG        BIT(5)   ///< long int
#define TT_LLONG       BIT(6)   ///< long long int
#define TT_UNSIGNED    BIT(7)   ///< unsigned int
#define TT_FLOAT       BIT(8)   ///< floating point number
#define TT_SINGLE_PREC BIT(9)   ///< float
#define TT_DOUBLE_PREC BIT(10)  ///< double
#define TT_EXT_PREC    BIT(11)  ///< long double
#define TT_INF         BIT(12)  ///< infinite 1.#INF
#define TT_INDEF       BIT(13)  ///< indefinite 1.#IND
#define TT_NAN         BIT(14)  ///< NaN
#define TT_IPADDR      BIT(15)  ///< ip address (address may still be ill-formed, e.g. `102948.22.999.1`)
#define TT_IPV4        BIT(16)  ///< ipv4 address format
#define TT_IPV6        BIT(17)  ///< ipv6 address format
#define TT_IPV6LIT     BIT(18)  ///< ipv6 address is expressed as literal (e.g. `[2001:db8:a::123]`)
#define TT_IPV6ZONE    BIT(19)  ///< ipv6 address contains a zone index/string (e.g. `fe80::1ff:fe23:4567:890a%3`)
#define TT_IPPORT      BIT(20)  ///< ip address includes a port

/** @} */

/**
 * Token flags
 *
 * NOTE(review): presumably stored in `Tok::flags` rather than `Tok::subtype`;
 * `TT_TRUNC` uses the same bit index as the `TT_IPADDR` subtype flag, so the
 * two must not be tested against the same field -- confirm against the
 * lexer implementation.
 *
 * @{
 */

/// Indicates `Tok` originally exceeded `MAXTOKLEN` and was consequently truncated.
#define TT_TRUNC BIT(15)

/** @} */

/**
 * \brief Lexer punctuation token descriptor (text -> token `subtype`).
 *
 * Punctuation descriptors are arranged in a list whose last element
 * has a `NULL` text pointer.
 */
typedef struct Punctuation {
	const char *p;   ///< Punctuation text, `NULL` for last element in punctuation list.
	Uint32      id;  ///< Punctuation identifier (returned in `Tok->subtype`)
} Punctuation;

// Punctuation ids, these are the values carried by `Punctuation::id`
// (documented above as being returned in `Tok->subtype`).
//
// NOTE(review): the group labels below are inferred from the identifier
// names only; the actual text bound to each id is defined by the
// punctuation list in use, which is not part of this header.

// three-character operators and other multi-character tokens (presumably)
#define P_RSHIFT_ASSIGN 1
#define P_LSHIFT_ASSIGN 2
#define P_PARMS         3
#define P_PRECOMPMERGE  4

// logic and comparison operators
#define P_LOGIC_AND  5
#define P_LOGIC_OR   6
#define P_LOGIC_GEQ  7
#define P_LOGIC_LEQ  8
#define P_LOGIC_EQ   9
#define P_LOGIC_UNEQ 10

// arithmetic compound assignment, increment and decrement
#define P_MUL_ASSIGN 11
#define P_DIV_ASSIGN 12
#define P_MOD_ASSIGN 13
#define P_ADD_ASSIGN 14
#define P_SUB_ASSIGN 15
#define P_INC        16
#define P_DEC        17

// bitwise compound assignment and shifts
#define P_BIN_AND_ASSIGN 18
#define P_BIN_OR_ASSIGN  19
#define P_BIN_XOR_ASSIGN 20
#define P_RSHIFT         21
#define P_LSHIFT         22

// pointer reference, arithmetic operators and plain assignment
#define P_POINTERREF 23
#define P_MUL        24
#define P_DIV        25
#define P_MOD        26
#define P_ADD        27
#define P_SUB        28
#define P_ASSIGN     29

// bitwise operators
#define P_BIN_AND 30
#define P_BIN_OR  31
#define P_BIN_XOR 32
#define P_BIN_NOT 33

// logic negation and strict comparison
#define P_LOGIC_NOT     34
#define P_LOGIC_GREATER 35
#define P_LOGIC_LESS    36

// separators
#define P_REF          37
#define P_COMMA        38
#define P_SEMICOLON    39
#define P_COLON        40
#define P_QUESTIONMARK 41

// brackets and backslash
#define P_PARENOPEN      42
#define P_PARENCLOSE     43
#define P_BRACEOPEN      44
#define P_BRACECLOSE     45
#define P_SQBRACKETOPEN  46
#define P_SQBRACKETCLOSE 47
#define P_BACKSLASH      48

// precompiler marker and dollar sign (presumably `#` and `$`)
#define P_PRECOMP 49
#define P_DOLLAR  50

/**
 * \brief Token returned by `Lex`.
 *
 * Holds the text of a single token along with the information
 * collected about it.
 */
typedef struct Tok {
	Uint16 type;     ///< Token type, one of the `TT_*` token type constants
	Uint16 flags;    ///< Token flags (e.g. `TT_TRUNC`)
	Uint32 subtype;  ///< Punctuation id, or subtype flags for `TT_NUMBER`

	// NOTE(review): position fields, their exact meaning is only suggested
	// by their names -- confirm against the lexer implementation
	unsigned linesCrossed;
	unsigned spacesBeforeToken;
	unsigned line;

	// NOTE(review): presumably the numeric value of `TT_NUMBER` tokens -- confirm
	long long intvalue;
	double    floatvalue;

	struct Tok *nextToken;

	char text[MAXTOKLEN];  // NOTE: must stay the last member to allow partial allocation
} Tok;

/**
 * Lexer flags, see `SetLexerFlags()` and `GetLexerFlags()`.
 *
 * Every flag must own a distinct bit, otherwise enabling one option
 * silently enables another. Flags are stored inside the 16-bit wide
 * `Lex::flags` field, so only `BIT(0)` through `BIT(15)` are available.
 *
 * @{
 */

/// Disregard lexer errors
#define L_NOERR  BIT(0)
/// Disregard lexer warnings
#define L_NOWARN BIT(1)
/// Disregard both errors and warnings
#define L_QUIET (L_NOERR | L_NOWARN)
/// Use console colors when reporting errors and warnings
#define L_COLORED BIT(2)
/// Parse all tokens as strings, instead of breaking them using full-fledged C rules
#define L_STRONLY BIT(3)
/// Allow file paths within tokens
#define L_ALLOWPATHS BIT(4)
/// Do not allow escapes within strings
#define L_NOSTRESC BIT(5)
/// Do not concatenate consecutive strings
#define L_NOSTRCAT BIT(6)
/// Concatenate strings separated by a backslash+newline
#define L_ALLOWBACKSLASHSTRCAT BIT(7)
/// Allow multichar literals
#define L_ALLOWMULTICHARLIT    BIT(8)
/// Accepts IP addresses (parsed as `TT_NUMBER`)
#define L_ALLOWIPADDR          BIT(9)
/// IP addresses with port numbers, IPv6 literals or zone ids won't be accepted,
/// only meaningful if used with `L_ALLOWIPADDR`.
#define L_PLAINIPADDRONLY      BIT(10)
/// Allow truncating tokens exceeding `MAXTOKLEN`.
#define L_ALLOWTRUNC           BIT(11)
/// Do not search base `#include` paths (used by PC library).
#define L_NOBASEINCLUDES       BIT(12)
// NOTE: this flag used to be defined as BIT(10), colliding with
//       `L_PLAINIPADDRONLY`; it now takes the first free bit so that
//       the value of every other flag is left untouched.
/// Allow special floating point exception tokens (0.#INF, 0.#IND).
#define L_ALLOWFLOATEXC        BIT(13)

/** @} */

// NOTE: the following are sentinel values cast to the lexer callback
//       pointer type (-1, 0 and 1), they are not addresses of actual functions.

/// Special callback, invokes immediate program termination after reporting a lexer message
#define LEX_QUIT  ((void (*)(Lex *, const char *, void *)) -1)
/// Special callback, makes the lexer ignore the warning or error
/// (same behavior as `L_NOERR` and `L_NOWARN`, but as an explicit callback).
#define LEX_IGN   ((void (*)(Lex *, const char *, void *))  0)
/// Special callback, makes the lexer print an error or warning message to `stderr`,
/// doesn't terminate execution.
#define LEX_WARN  ((void (*)(Lex *, const char *, void *))  1)

/**
 * \brief A lexer, breaks text into single tokens, keeping track of the current position.
 *
 * \note This struct should be considered opaque.
 */
typedef struct Lex Lex;
struct Lex {
	char    *pos, *lim;
	unsigned line;
	Uint16   flags;
	Boolean8 hasError;
	Boolean8 hasBufferedToken;
	Rune     nr;

	const Punctuation *puncts;

	void  *obj;
	void (*Error)(Lex *, const char *, void *);
	void (*Warn)(Lex *, const char *, void *);

	Lex *nextLexer;

	Tok  buf;
	char name[MAXTOKLEN];
};

/**
 * \brief Register callbacks for lexer warning and error triggers.
 *
 * \param [out] p     A lexer, must not be `NULL`
 * \param [in]  errf  Callback stored as the lexer error handler
 * \param [in]  warnf Callback stored as the lexer warning handler
 * \param [in]  obj   Pointer stored inside the lexer along with the callbacks
 */
FORCE_INLINE void SetLexerErrorFunc(Lex   *p,
                                    void (*errf)(Lex *, const char *, void *),
                                    void (*warnf)(Lex *, const char *, void *),
                                    void  *obj)
{
	p->obj   = obj;
	p->Warn  = warnf;
	p->Error = errf;
}

/**
 * \brief Set parsing session name and initial line number.
 *
 * \param [out] p    A lexer, must not be `NULL`
 * \param [in]  name Name for this parsing session
 * \param [in]  line Initial line number, 0 is implicitly changed to 1
 */
void BeginLexerSession(Lex *p, const char *name, unsigned line);

/**
 * \brief Setup lexer to parse text, sized.
 *
 * \param [out] p    A lexer, must not be `NULL`
 * \param [in]  text Text to be parsed, must have at least `n` chars
 * \param [in]  n    Number of chars in `text`
 */
void SetLexerTextn(Lex *p, const char *text, size_t n);

/**
 * \brief Setup lexer to parse a `NUL` terminated text.
 *
 * Measures `text` and forwards it to `SetLexerTextn()`.
 *
 * \param [out] p    A lexer, must not be `NULL`
 * \param [in]  text `NUL` terminated text to be parsed
 */
FORCE_INLINE void SetLexerText(Lex *p, const char *text)
{
	// Local declaration, presumably to avoid including <string.h> from this header
	EXTERNC size_t strlen(const char *);

	size_t len = strlen(text);

	SetLexerTextn(p, text, len);
}

/**
 * \brief Replace the lexer flags.
 *
 * \param [out] p     A lexer, must not be `NULL`
 * \param [in]  flags New flags for the lexer, only the bits fitting the
 *                    16-bit wide flags field of the lexer are retained
 */
FORCE_INLINE void SetLexerFlags(Lex *p, unsigned flags)
{
	p->flags = (Uint16) flags;
}

/**
 * \brief Retrieve current lexer flags.
 *
 * \param [in] p A lexer, must not be `NULL`
 *
 * \return The flags currently stored inside the lexer.
 */
FORCE_INLINE unsigned GetLexerFlags(Lex *p)
{
	unsigned flags = p->flags;

	return flags;
}

/// Trigger an error over a lexer.
CHECK_PRINTF(2, 3) void LexerError(Lex *p, const char *fmt, ...);
/**
 * Trigger a warning over a lexer.
 */
CHECK_PRINTF(2, 3) void LexerWarning(Lex *p, const char *fmt, ...);

/**
 * \brief Test whether a lexer reached the end.
 *
 * The end is reached when the text position hit its limit or a `NUL`
 * char, and no buffered token is left to be returned.
 */
FORCE_INLINE Boolean IsLexerEndOfFile(Lex *p)
{
	// Only peek at the current char when still within the text limit
	const int textExhausted = (p->pos >= p->lim) || (*p->pos == '\0');

	return textExhausted && !p->hasBufferedToken;
}

/**
 * \brief Test whether a lexer encountered an error.
 *
 * \param [in] p A lexer, must not be `NULL`
 *
 * \return The error status stored inside the lexer.
 */
FORCE_INLINE Boolean HasLexerError(Lex *p)
{
	return (Boolean) p->hasError;
}

/**
 * \brief Read and return next token.
 *
 * \param [in,out] p    A lexer, must not be `NULL`
 * \param [out]    dest Storage for the returned token, must not be `NULL`
 *
 * \return If a new token has been read, then `dest->text` is returned,
 *         `NULL` is returned if a parsing error has been encountered,
 *         or no more tokens are available.
 */
char *Lex_ReadToken(Lex *p, Tok *dest);
/**
 * \brief Read and return next token in the same line.
 *
 * This is a variant of `Lex_ReadToken()` useful to implement
 * a C Preprocessor, it avoids parsing spanning more than one line.
 * `\` followed by a newline is recognized and treated as a regular
 * space.
 */
char *Lex_ReadTokenOnLine(Lex *p, Tok *dest);
/**
 * \brief Expects an integral token, reading and returning its value.
 *
 * \param [in,out] p            A lexer, must not be `NULL`
 * \param [in]     optionalSign Allow an optional `+` or `-` sign before the
 *                              token, if set to `FALSE` only unsigned
 *                              integers are allowed.
 *
 * \return The token value, 0 on error, use `HasLexerError()` to distinguish
 *         between actual 0 and error value.
 */
long long Lex_ParseInt(Lex *p, Boolean optionalSign);
/**
 * \brief Expects a boolean token, reading and returning its value.
 *
 * \param [in,out] p            A lexer, must not be `NULL`
 * \param [in]     allowNumeric Convert numeric values to booleans, 0 for
 *                              `FALSE`, any other numeric value for `TRUE`
 *
 * \return The boolean value, `FALSE` on error or end of file,
 *         use `HasLexerError()` or `IsLexerEndOfFile()` to distinguish
 *         between actual `FALSE` and error value.
 */
Boolean Lex_ParseBool(Lex *p, Boolean allowNumeric);
/**
 * \brief Expects a floating point token, reading and returning its value.
 *
 * \param [in,out] p            A lexer, must not be `NULL`
 * \param [in]     optionalSign Allow an optional `+` or `-` sign before the
 *                              token, if set to `FALSE` only non-negative
 *                              values are allowed.
 *
 * \return The float value, 0 on error or end of file,
 *         use `HasLexerError()` or `IsLexerEndOfFile()` to distinguish
 *         between actual 0 and error value.
 */
double Lex_ParseFloat(Lex *p, Boolean optionalSign);

/**
 * \brief Read a one dimensional matrix (vector of length `n`) from `p` into `dest`.
 *
 * Matrix format is:
 * ```
 * (x y z w ...)
 * ```
 *
 * \return `TRUE` on success, `FALSE` on error.
 */
Boolean Lex_ParseMatrix1(Lex *p, float *dest, size_t n);
/**
 * \brief `Lex_ParseMatrix1()` variant for two dimensional matrixes.
 *
 * Matrix format is:
 * ```
 * ((x0 y0 z0 w0 ...) (x1 y1 z1 w1 ...) ...)
 * ```
 */
Boolean Lex_ParseMatrix2(Lex *p, float *dest, size_t n, size_t m);
/// `Lex_ParseMatrix1()` variant for tridimensional matrixes.
Boolean Lex_ParseMatrix3(Lex *p, float *dest, size_t n, size_t m, size_t u);

/// Discard any buffered token and any in text token up to a new line.
void Lex_SkipLine(Lex *p);
/**
 * \brief Skip every token until `tok` is encountered.
 *
 * \param [in,out] p   A lexer, must not be `NULL`
 * \param [in]     tok Token to look for, must not be `NULL`
 *
 * \return `tok` on success, `NULL` on error or end of file.
 */
char *Lex_SkipUntil(Lex *p, const char *tok);
/**
 * \brief Expect and skip section enclosed within braces.
 *
 * Braced sections are enclosed by punctuation tokens of id `P_BRACEOPEN` and
 * `P_BRACECLOSE`.
 *
 * \param [in,out] p               A lexer, must not be `NULL`
 * \param [in]     parseFirstBrace Whether the function should expect the next
 *                                 token to be the first brace of the section
 *                                 (`TRUE`) or it should assume the first brace
 *                                 has already been parsed (`FALSE`).
 *
 * \return `TRUE` if section was skipped successfully, `FALSE` on error
 *         (either unbalanced braces or unexpected token).
 */
Boolean Lex_SkipBracedSection(Lex *p, Boolean parseFirstBrace);

/**
 * \brief Expect a token, matching and returning it, raises error on mismatch.
 *
 * \param [in,out] p   A lexer, must not be `NULL`
 * \param [in]     tok Token to be expected, must not be `NULL`
 *
 * \return On success `tok` is returned, on error `NULL`.
 */
char *Lex_MatchToken(Lex *p, const char *tok);
/**
 * \brief Expect any token, raises an error if none is found.
 *
 * \param [in,out] p    A lexer, must not be `NULL`
 * \param [out]    dest Storage for returned token, must not be `NULL`
 *
 * \return On success `dest->text` is returned, on error `NULL`.
 */
char *Lex_MatchAnyToken(Lex *p, Tok *dest);
/**
 * \brief Expect a token of a specific `type` and `subtype`, raise error on mismatch.
 *
 * \param [in,out] p       A lexer, must not be `NULL`
 * \param [out]    dest    Storage for returned token, must not be `NULL`
 * \param [in]     type    Token type to be expected
 * \param [in]     subtype Subtype mask for the expected token
 *
 * \return On success `dest->text`, `NULL` otherwise.
 */
char *Lex_MatchTokenType(Lex *p, Tok *dest, int type, unsigned subtype);

/**
 * Check whether next token matches `tok`.
 *
 * If token matches it is read from `p` and returned, as in `Lex_ReadToken()`,
 * otherwise `p` is left unaltered (except for parsing errors).
 */
char *Lex_CheckToken(Lex *p, const char *tok);
/// Similar to `Lex_CheckToken()`, but matches by token `type` and `subtype`.
char *Lex_CheckTokenType(Lex *p, Tok *dest, int type, unsigned subtype);

/**
 * \brief Peek next token from `p` and test whether it matches with `tok`.
 *
 * The next token is never consumed from `p`, the lexer is left unaltered
 * (except for parsing errors).
 */
char *Lex_PeekToken(Lex *p, const char *tok);
/// Similar to `Lex_PeekToken()`, but matches by token `type` and `subtype`.
char *Lex_PeekTokenType(Lex *p, Tok *dest, int type, unsigned subtype);

/**
 * \brief Place a token back into the lexer.
 *
 * Only one token may be placed back into the lexer at a time,
 * it will be returned back on the next call to `Lex_ReadToken()`.
 */
void Lex_UngetToken(Lex *p, const Tok *tok);

#endif