cosmopolitan/third_party/ctags/tex.c

// clang-format off
/*
 *	 $Id: tex.c 666 2008-05-15 17:47:31Z dfishburn $
 *
 *	 Copyright (c) 2008, David Fishburn
 *
 *	 This source code is released for free distribution under the terms of the
 *	 GNU General Public License.
 *
 *	 This module contains functions for generating tags for TeX language files.
 *
 *	 Tex language reference:
 *		 http://en.wikibooks.org/wiki/TeX#The_Structure_of_TeX
 */

/*
 *	 INCLUDE FILES
 */
#include "third_party/ctags/general.h"	/* must always come first */
#include "libc/str/str.h"	/* to define isalpha () */
#include "libc/runtime/runtime.h"
#ifdef DEBUG
#include "libc/calls/calls.h"
#include "libc/calls/dprintf.h"
#include "libc/calls/weirdtypes.h"
#include "libc/fmt/fmt.h"
#include "libc/mem/fmt.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "third_party/musl/tempnam.h"
#endif

#include "third_party/ctags/debug.h"
#include "third_party/ctags/entry.h"
#include "third_party/ctags/keyword.h"
#include "third_party/ctags/parse.h"
#include "third_party/ctags/read.h"
#include "third_party/ctags/routines.h"
#include "third_party/ctags/vstring.h"

/*
 *	 MACROS
 */
#define isType(token,t)		(boolean) ((token)->type == (t))
#define isKeyword(token,k)	(boolean) ((token)->keyword == (k))

/*
 *	 DATA DECLARATIONS
 */

typedef enum eException { ExceptionNone, ExceptionEOF } exception_t;

/*
 * Used to specify type of keyword.
 */
typedef enum eKeywordId {
	KEYWORD_NONE = -1,
	KEYWORD_chapter,
	KEYWORD_section,
	KEYWORD_subsection,
	KEYWORD_subsubsection,
	KEYWORD_part,
	KEYWORD_paragraph,
	KEYWORD_subparagraph,
	KEYWORD_include
} keywordId;

/*	Used to determine whether keyword is valid for the token language and
 *	what its ID is.
 */
typedef struct sKeywordDesc {
	const char *name;
	keywordId id;
} keywordDesc;

typedef enum eTokenType {
	TOKEN_UNDEFINED,
	TOKEN_CHARACTER,
	TOKEN_CLOSE_PAREN,
	TOKEN_COMMA,
	TOKEN_KEYWORD,
	TOKEN_OPEN_PAREN,
	TOKEN_IDENTIFIER,
	TOKEN_STRING,
	TOKEN_OPEN_CURLY,
	TOKEN_CLOSE_CURLY,
	TOKEN_OPEN_SQUARE,
	TOKEN_CLOSE_SQUARE,
	TOKEN_QUESTION_MARK,
	TOKEN_STAR
} tokenType;

typedef struct sTokenInfo {
	tokenType		type;
	keywordId		keyword;
	vString *		string;
	vString *		scope;
	unsigned long 	lineNumber;
	fpos_t 			filePosition;
} tokenInfo;

/*
 *	DATA DEFINITIONS
 */

static langType Lang_js;

static jmp_buf Exception;

typedef enum {
	TEXTAG_CHAPTER,
	TEXTAG_SECTION,
	TEXTAG_SUBSECTION,
	TEXTAG_SUBSUBSECTION,
	TEXTAG_PART,
	TEXTAG_PARAGRAPH,
	TEXTAG_SUBPARAGRAPH,
	TEXTAG_INCLUDE,
	TEXTAG_COUNT
} texKind;

static kindOption TexKinds [] = {
	{ TRUE,  'c', "chapter",		  "chapters"		   },
	{ TRUE,  's', "section",		  "sections"		   },
	{ TRUE,  'u', "subsection",		  "subsections"		   },
	{ TRUE,  'b', "subsubsection",	  "subsubsections"	   },
	{ TRUE,  'p', "part",			  "parts"			   },
	{ TRUE,  'P', "paragraph",		  "paragraphs"		   },
	{ TRUE,  'G', "subparagraph",	  "subparagraphs"	   },
	{ TRUE,  'i', "include",	  	  "includes"		   }
};

static const keywordDesc TexKeywordTable [] = {
	/* keyword			keyword ID */
	{ "chapter",		KEYWORD_chapter				},
	{ "section",		KEYWORD_section				},
	{ "subsection",		KEYWORD_subsection			},
	{ "subsubsection",	KEYWORD_subsubsection		},
	{ "part",			KEYWORD_part				},
	{ "paragraph",		KEYWORD_paragraph			},
	{ "subparagraph",	KEYWORD_subparagraph		},
	{ "include",		KEYWORD_include				}
};

/*
 *	 FUNCTION DEFINITIONS
 */

static boolean isIdentChar (const int c)
{
	return (boolean)
		(isalpha (c) || isdigit (c) || c == '$' ||
		  c == '_' || c == '#' || c == '-' || c == '.');
}

static void buildTexKeywordHash (void)
{
	const size_t count = sizeof (TexKeywordTable) /
		sizeof (TexKeywordTable [0]);
	size_t i;
	for (i = 0	;  i < count  ;  ++i)
	{
		const keywordDesc* const p = &TexKeywordTable [i];
		addKeyword (p->name, Lang_js, (int) p->id);
	}
}

static tokenInfo *newToken (void)
{
	tokenInfo *const token = xMalloc (1, tokenInfo);

	token->type			= TOKEN_UNDEFINED;
	token->keyword		= KEYWORD_NONE;
	token->string		= vStringNew ();
	token->scope		= vStringNew ();
	token->lineNumber   = getSourceLineNumber ();
	token->filePosition = getInputFilePosition ();

	return token;
}

static void deleteToken (tokenInfo *const token)
{
	vStringDelete (token->string);
	vStringDelete (token->scope);
	eFree (token);
}

/*
 *	 Tag generation functions
 */

static void makeConstTag (tokenInfo *const token, const texKind kind)
{
	if (TexKinds [kind].enabled )
	{
		const char *const name = vStringValue (token->string);
		tagEntryInfo e;
		initTagEntry (&e, name);

		e.lineNumber   = token->lineNumber;
		e.filePosition = token->filePosition;
		e.kindName	   = TexKinds [kind].name;
		e.kind		   = TexKinds [kind].letter;

		makeTagEntry (&e);
	}
}

static void makeTexTag (tokenInfo *const token, texKind kind)
{
	vString *	fulltag;

	if (TexKinds [kind].enabled)
	{
		/*
		 * If a scope has been added to the token, change the token
		 * string to include the scope when making the tag.
		 */
		if ( vStringLength (token->scope) > 0 )
		{
			fulltag = vStringNew ();
			vStringCopy (fulltag, token->scope);
			vStringCatS (fulltag, ".");
			vStringCatS (fulltag, vStringValue (token->string));
			vStringTerminate (fulltag);
			vStringCopy (token->string, fulltag);
			vStringDelete (fulltag);
		}
		makeConstTag (token, kind);
	}
}

/*
 *	 Parsing functions
 */

static void parseString (vString *const string, const int delimiter)
{
	boolean end = FALSE;
	while (! end)
	{
		int c = fileGetc ();
		if (c == EOF)
			end = TRUE;
		else if (c == '\\')
		{
			c = fileGetc(); /* This maybe a ' or ". */
			vStringPut (string, c);
		}
		else if (c == delimiter)
			end = TRUE;
		else
			vStringPut (string, c);
	}
	vStringTerminate (string);
}

/*
 *	Read a C identifier beginning with "firstChar" and places it into
 *	"name".
 */
static void parseIdentifier (vString *const string, const int firstChar)
{
	int c = firstChar;
	Assert (isIdentChar (c));
	do
	{
		vStringPut (string, c);
		c = fileGetc ();
	} while (isIdentChar (c));

	vStringTerminate (string);
	if (!isspace (c))
		fileUngetc (c);		/* unget non-identifier character */
}

static void readToken (tokenInfo *const token)
{
	int c;

	token->type			= TOKEN_UNDEFINED;
	token->keyword		= KEYWORD_NONE;
	vStringClear (token->string);

getNextChar:
	do
	{
		c = fileGetc ();
		token->lineNumber   = getSourceLineNumber ();
		token->filePosition = getInputFilePosition ();
	}
	while (c == '\t'  ||  c == ' ' ||  c == '\n');

	switch (c)
	{
		case EOF: longjmp (Exception, (int)ExceptionEOF);	break;
		case '(': token->type = TOKEN_OPEN_PAREN;			break;
		case ')': token->type = TOKEN_CLOSE_PAREN;			break;
		case ',': token->type = TOKEN_COMMA;				break;
		case '{': token->type = TOKEN_OPEN_CURLY;			break;
		case '}': token->type = TOKEN_CLOSE_CURLY;			break;
		case '[': token->type = TOKEN_OPEN_SQUARE;			break;
		case ']': token->type = TOKEN_CLOSE_SQUARE;			break;
		case '*': token->type = TOKEN_STAR;					break;

		case '\'':
		case '"':
				  token->type = TOKEN_STRING;
				  parseString (token->string, c);
				  token->lineNumber = getSourceLineNumber ();
				  token->filePosition = getInputFilePosition ();
				  break;

		case '\\':
				  /*
				   * All Tex tags start with a backslash.
				   * Check if the next character is an alpha character
				   * else it is not a potential tex tag.
				   */
				  c = fileGetc ();
				  if (! isalpha (c))
					  fileUngetc (c);
				  else
				  {
					  parseIdentifier (token->string, c);
					  token->lineNumber = getSourceLineNumber ();
					  token->filePosition = getInputFilePosition ();
					  token->keyword = analyzeToken (token->string, Lang_js);
					  if (isKeyword (token, KEYWORD_NONE))
						  token->type = TOKEN_IDENTIFIER;
					  else
						  token->type = TOKEN_KEYWORD;
				  }
				  break;

		case '%':
				  fileSkipToCharacter ('\n'); /* % are single line comments */
				  goto getNextChar;
				  break;

		default:
				  if (! isIdentChar (c))
					  token->type = TOKEN_UNDEFINED;
				  else
				  {
					  parseIdentifier (token->string, c);
					  token->lineNumber = getSourceLineNumber ();
					  token->filePosition = getInputFilePosition ();
					  token->type = TOKEN_IDENTIFIER;
				  }
				  break;
	}
}

static void copyToken (tokenInfo *const dest, tokenInfo *const src)
{
	dest->lineNumber = src->lineNumber;
	dest->filePosition = src->filePosition;
	dest->type = src->type;
	dest->keyword = src->keyword;
	vStringCopy (dest->string, src->string);
	vStringCopy (dest->scope, src->scope);
}

/*
 *	 Scanning functions
 */

static boolean parseTag (tokenInfo *const token, texKind kind)
{
	tokenInfo *const name = newToken ();
	vString *	fullname;
	boolean		useLongName = TRUE;

	fullname = vStringNew ();
	vStringClear (fullname);

	/*
	 * Tex tags are of these formats:
	 *   \keyword{any number of words}
	 *   \keyword[short desc]{any number of words}
	 *   \keyword*[short desc]{any number of words}
	 *
	 * When a keyword is found, loop through all words within
	 * the curly braces for the tag name.
	 */

	if (isType (token, TOKEN_KEYWORD))
	{
		copyToken (name, token);
		readToken (token);
	}

	if (isType (token, TOKEN_OPEN_SQUARE))
	{
		useLongName = FALSE;

		readToken (token);
		while (! isType (token, TOKEN_CLOSE_SQUARE) )
		{
			if (isType (token, TOKEN_IDENTIFIER))
			{
				if (fullname->length > 0)
					vStringCatS (fullname, " ");
				vStringCatS (fullname, vStringValue (token->string));
			}
			readToken (token);
		}
		vStringTerminate (fullname);
		vStringCopy (name->string, fullname);
		makeTexTag (name, kind);
	}

	if (isType (token, TOKEN_STAR))
	{
		readToken (token);
	}

	if (isType (token, TOKEN_OPEN_CURLY))
	{
		readToken (token);
		while (! isType (token, TOKEN_CLOSE_CURLY) )
		{
			/* if (isType (token, TOKEN_IDENTIFIER) && useLongName) */
			if (useLongName)
			{
				if (fullname->length > 0)
					vStringCatS (fullname, " ");
				vStringCatS (fullname, vStringValue (token->string));
			}
			readToken (token);
		}
		if (useLongName)
		{
			vStringTerminate (fullname);
			vStringCopy (name->string, fullname);
			makeTexTag (name, kind);
		}
	}

	deleteToken (name);
	vStringDelete (fullname);
	return TRUE;
}

static void parseTexFile (tokenInfo *const token)
{
	do
	{
		readToken (token);

		if (isType (token, TOKEN_KEYWORD))
		{
			switch (token->keyword)
			{
				case KEYWORD_chapter:
					parseTag (token, TEXTAG_CHAPTER);
					break;
				case KEYWORD_section:
					parseTag (token, TEXTAG_SECTION);
					break;
				case KEYWORD_subsection:
					parseTag (token, TEXTAG_SUBSECTION);
					break;
				case KEYWORD_subsubsection:
					parseTag (token, TEXTAG_SUBSUBSECTION);
					break;
				case KEYWORD_part:
					parseTag (token, TEXTAG_PART);
					break;
				case KEYWORD_paragraph:
					parseTag (token, TEXTAG_PARAGRAPH);
					break;
				case KEYWORD_subparagraph:
					parseTag (token, TEXTAG_SUBPARAGRAPH);
					break;
				case KEYWORD_include:
					parseTag (token, TEXTAG_INCLUDE);
					break;
				default:
					break;
			}
		}
	} while (TRUE);
}

static void initialize (const langType language)
{
	Assert (sizeof (TexKinds) / sizeof (TexKinds [0]) == TEXTAG_COUNT);
	Lang_js = language;
	buildTexKeywordHash ();
}

static void findTexTags (void)
{
	tokenInfo *const token = newToken ();
	exception_t exception;

	exception = (exception_t) (setjmp (Exception));
	while (exception == ExceptionNone)
		parseTexFile (token);

	deleteToken (token);
}

/* Create parser definition stucture */
extern parserDefinition* TexParser (void)
{
	static const char *const extensions [] = { "tex", NULL };
	parserDefinition *const def = parserNew ("Tex");
	def->extensions = extensions;
	/*
	 * New definitions for parsing instead of regex
	 */
	def->kinds		= TexKinds;
	def->kindCount	= KIND_COUNT (TexKinds);
	def->parser		= findTexTags;
	def->initialize = initialize;

	return def;
}
/* vi:set tabstop=4 shiftwidth=4 noexpandtab: */