shithub: flite

ref: 397265f5791291defa76487c2388e1898e2e433c
dir: /include/cst_regex.h/

View raw version
/*************************************************************************/
/*                                                                       */
/*                  Language Technologies Institute                      */
/*                     Carnegie Mellon University                        */
/*                        Copyright (c) 1999                             */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission is hereby granted, free of charge, to use and distribute  */
/*  this software and its documentation without restriction, including   */
/*  without limitation the rights to use, copy, modify, merge, publish,  */
/*  distribute, sublicense, and/or sell copies of this work, and to      */
/*  permit persons to whom this work is furnished to do so, subject to   */
/*  the following conditions:                                            */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*   4. The authors' names are not used to endorse or promote products   */
/*      derived from this software without specific prior written        */
/*      permission.                                                      */
/*                                                                       */
/*  CARNEGIE MELLON UNIVERSITY AND THE CONTRIBUTORS TO THIS WORK         */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL CARNEGIE MELLON UNIVERSITY NOR THE CONTRIBUTORS BE LIABLE      */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author:  Alan W Black (awb@cs.cmu.edu)                    */
/*               Date:  January 2000                                     */
/*************************************************************************/
/*                                                                       */
/*  cst front-end to Henry Spencer's regex code                          */
/*                                                                       */
/*************************************************************************/

/* Includes portions or regexp.h, copyright follows: */
/*
 * Copyright (c) 1986 by University of Toronto.
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley
 * by Henry Spencer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)regexp.h	8.1 (Berkeley) 6/2/93
 */

#ifndef _CST_REGEX_H__
#define _CST_REGEX_H__

#include "cst_file.h"
#include "cst_string.h"

/*
 * The first byte of the regexp internal "program" is actually this magic
 * number; the start node begins in the second byte.
 */
#define	CST_REGMAGIC	0234

typedef struct cst_regex_struct {
    char regstart;		/* Internal use only. */
    char reganch;		/* Internal use only. */
    char *regmust;		/* Internal use only. */
    int regmlen;		/* Internal use only. */
    int regsize;
    char *program;
} cst_regex;

#define CST_NSUBEXP  10
typedef struct cst_regstate_struct {
	const char *startp[CST_NSUBEXP];
	const char *endp[CST_NSUBEXP];
	const char *input;
	const char *bol;
} cst_regstate;

cst_regex *new_cst_regex(const char *str);
void delete_cst_regex(cst_regex *r);

int cst_regex_match(const cst_regex *r, const char *str);
cst_regstate *cst_regex_match_return(const cst_regex *r, const char *str);

/* Internal functions from original HS code */
cst_regex *hs_regcomp(const char *);
cst_regstate *hs_regexec(const cst_regex *, const char *);
void hs_regdelete(cst_regex *);

/* Works similarly to snprintf(3), in that at most max characters are
   written to out, including the trailing NUL, and the return value is
   the number of characters written, *excluding* the trailing NUL.
   Also works similarly to wcstombs(3) in that passing NULL as out
   will count the number of characters that would be written without
   doing any actual conversion, and ignoring max.  So, you could use
   it like this:

   rx = new_cst_regex("\\(.*\\)_\\(.*\\)");
   if ((rs = cst_regex_match_return(rx, "foo_bar")) != NULL) {
   	size_t n;

	n = cst_regsub(rs, "\\1_\\2_quux", NULL, 0) + 1;
	out = cst_alloc(char, n);
	cst_regsub(rs, "\\1_\\2_quux", out, n);
   } */
size_t cst_regsub(const cst_regstate *r, const char *in, char *out, size_t max);

/* Initialize the regex engine and global regex constants */
void cst_regex_init(void);

/* Regexps used in text processing (these are latin-alphabet specific
   and to some extent US English-specific) */
extern const cst_regex * const cst_rx_white;
extern const cst_regex * const cst_rx_alpha;
extern const cst_regex * const cst_rx_uppercase;
extern const cst_regex * const cst_rx_lowercase;
extern const cst_regex * const cst_rx_alphanum;
extern const cst_regex * const cst_rx_identifier;
extern const cst_regex * const cst_rx_int;
extern const cst_regex * const cst_rx_double;
extern const cst_regex * const cst_rx_commaint;
extern const cst_regex * const cst_rx_digits;
extern const cst_regex * const cst_rx_dotted_abbrev;

/* Table of regexps used in CART trees (only one so far) */
extern const cst_regex * const cst_regex_table[];
#define CST_RX_dotted_abbrev_NUM 0

#endif