/*** program cregex ***********************************************************
 *
 *
 * Name:
 *      cregex : create regex-file from PROSITE for use with PROSEARCH.
 *
 *
 * Syntax:
 *      cregex <database> <regex-file>
 *
 *
 * Description:
 *      Cregex creates the file containing valid AWK regular expressions,
 *      from the native PROSITE data bank. This file is used in Kolakowski's
 *      PROSEARCH script.
 *      For appearances, the output-file may be sorted alphabetically.
 *
 *
 * Author:
 *      Jack A.M. Leunissen, CAOS/CAMM Center, Nijmegen, The Netherlands.
 *
 *
 * Version:     Date:           By:             Update:
 *      1.0     23-Oct-1990     JackL           -
 *      1.01    26-Nov-1990     JackL           Bug in check_title() fixed.
 *      1.1     26-Dec-1990     JackL           Processing altered.
 *	1.2	24-Sep-1992	JackL		Catching pattern overflow.
 *
 *
 * Copyright:
 *	(c) 1990, 1991, 1992 by Jack A.M. Leunissen, all rights reserved.
 *
 */

/*** Address ******************************************************************
 *
 *	Jack A.M. Leunissen
 *	CAOS/CAMM Center
 *	University of Nijmegen
 *	Toernooiveld 1
 *	6525 ED Nijmegen
 *	The Netherlands
 * 	Tel: +31 - 80 - 652248
 *	Fax: +31 - 80 - 652977
 *	Email: jackl@caos.kun.nl
 */

/*** preprocessor *************************************************************
 *
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Buffer and table sizes.
 *
 * MAXLEN : bytes per row of pattern[][] (one generated regular expression)
 * MAXNUM : number of rows in pattern[][] (alternatives per entry)
 * MAXSTR : size of patlin[] and of the line/scratch buffers
 * MAXWRD : size of word[] (one pattern element) and of its translation
 * MAXCOD : size of code[]  (text taken from the AC-line)
 * MAXTTL : size of title[] (text taken from the ID-line)
 * MAXDOC : size of docu[]  (text taken from the DO-line)
 */
#define MAXLEN 512
#define MAXNUM 100
#define MAXSTR 512
#define MAXWRD  30
#define MAXCOD  10
#define MAXTTL  80
#define MAXDOC  10

/* Boolean values used for flags and function results. */
#define TRUE    1
#define FALSE   0

/*
 * Bit flags returned by parse_word() and tested in process().
 *
 * N_TERM : a '<' was seen in the element
 * C_TERM : a '>' was seen in the element
 * INLIST : the '<' or '>' was seen after a list had been opened
 * EXCLUD : the element contains an exclusion list ('{' ... '}')
 */
#define N_TERM  1
#define C_TERM  2
#define INLIST  4
#define EXCLUD  8

/* Octal 012 (decimal 10); get_pat() stops copying a line at this character. */
#define NEWLINE '\012'

/*** globals ******************************************************************
 *
 * pattern : table of regular expressions being built for the current entry;
 *           initialize() empties every row, print_pat() writes rows
 *           0 .. number-1.
 * word    : the pattern element most recently copied by read_word() and
 *           translated by parse_word().
 * ptr     : read position inside patlin; reset by initialize(), advanced by
 *           read_word().
 * code    : text extracted from the AC-line by read_data().
 * title   : text extracted from the ID-line by read_data().
 * patlin  : the pattern collected from the PA-line(s) by get_pat().
 * docu    : text extracted from the DO-line by read_data().
 * number  : count of rows of pattern[][] currently in use (starts at 1).
 * fi, fo  : input and output streams opened by open_files().
 *
 */

char pattern[MAXNUM][MAXLEN], word[MAXWRD], *ptr;
char code[MAXCOD], title[MAXTTL], patlin[MAXSTR], docu[MAXDOC];
int number;
FILE *fi, *fo;

/*** prototypes ***************************************************************
 *
 * Forward declarations of every function defined in this file.
 *
 */

/* Open the input and output streams named on the command line. */
void open_files(int, char **);
/* Read the next entry that has a PA-line; FALSE at end of input. */
int read_data(void);
/* Replace blanks in title[] by underscores. */
void check_title(void);
/* Copy the field after the line code, up to a given end character. */
void extract(char *, char *, char);
/* Collect the pattern from one or more PA-lines. */
int get_pat(char *, char *, char);
/* Reset pattern[][], number and ptr for a new entry. */
void initialize(void);
/* Copy the next pattern element from patlin[] into word[]. */
int read_word(void);
/* Translate word[] and append it to the patterns; FALSE if skipped. */
int process(void);
/* Append a (possibly repeated) unit to every pattern; FALSE on overflow. */
int add_pattern(char *, int, int);
/* Write the finished patterns to the output stream. */
void print_pat(void);
/* Translate word[] into AWK syntax and extract the repeat range. */
int parse_word(char *, int *, int *);
/* Close both streams and terminate the program. */
void close_files(void);

/*** main *********************************************************************
 *
 * Driver: for every entry delivered by read_data(), translate the pattern
 * element by element and, unless the translation was abandoned, write the
 * resulting regular expressions.
 *
 */

int main(int argc, char **argv)
{
        open_files(argc, argv);

        while (read_data()) {
                int more, ok;

                initialize();

                /*
                 * process() always runs after read_word(), so the last
                 * element of a pattern (for which read_word() returns 0)
                 * is translated as well.
                 */
                for (;;) {
                        more = read_word();
                        ok = process();
                        if (!more || !ok)
                                break;
                }

                if (ok)
                        print_pat();
        }

        close_files();          /* terminates the program */
}

/*** open_files ***************************************************************
 *
 * Open the input-file (argv[1], for reading) as 'fi' and the output-file
 * (argv[2], for writing) as 'fo'.
 *
 * The program is terminated with status 0 after printing a usage line when
 * the argument count is wrong, with status 1 when the input-file cannot be
 * opened, and with status 2 when the output-file cannot be created.
 *
 */

void open_files(int argc, char **argv)
{
        char *prog = argv[0];

        if (argc != 3) {
                fprintf(stderr, "Usage: %s database regex-file\n", prog);
                exit(0);
        }

        fi = fopen(argv[1], "r");
        if (fi == NULL) {
                fprintf(stderr, "%s: cannot open %s\n", prog, argv[1]);
                exit(1);
        }

        fo = fopen(argv[2], "w");
        if (fo == NULL) {
                fprintf(stderr, "%s: cannot create %s\n", prog, argv[2]);
                exit(2);
        }
}

/*** read_data ****************************************************************
 *
 * Read entries from 'fi' until one containing a PA-line has been read
 * completely.
 *
 * ID-line -> title
 * AC-line -> code
 * PA-line -> pattern
 * DO-line -> documentation
 * //-line -> end of entry
 *
 * Returns TRUE when such an entry was read, FALSE when the input is
 * exhausted (or get_pat() failed) before one could be completed.
 *
 */

/* TRUE when 'line' starts with the two-character line code c0,c1. */
static int line_has_code(const char *line, char c0, char c1)
{
        return (line[0] == c0 && line[1] == c1);
}

int read_data(void)
{
        char line[MAXSTR];
        int has_pat = FALSE, ac_found;

        do {
                /*
                 * Find the ID-line.  Both characters of the code must
                 * match; testing them separately would also accept lines
                 * such as "3D".
                 */
                do {
                        if (fgets(line, MAXSTR, fi) == NULL)
                                return(FALSE);
                } while (!line_has_code(line, 'I', 'D'));

                extract(title, line, ';');
                check_title();

                /*
                 * Handle the other relevant lines, up to the "//" line
                 * that terminates the entry.
                 */
                ac_found = FALSE;
                for (;;) {
                        if (fgets(line, MAXSTR, fi) == NULL)
                                return(FALSE);
                        if (line_has_code(line, '/', '/'))
                                break;

                        if (line_has_code(line, 'A', 'C')) {
                                if (ac_found)
                                        fprintf(stderr,
                                        "Too many AC-lines in %s\n",code);
                                else {
                                        extract(code, line, ';');
                                        ac_found = TRUE;
                                }
                        }
                        /*
                         * get_pat() may read continuation lines into
                         * 'line'; the DO test below then sees the last
                         * line it read.
                         */
                        if (line_has_code(line, 'P', 'A')) {
                                if (!get_pat(patlin, line, '.'))
                                        return(FALSE);
                                has_pat = TRUE;
                        }
                        if (line_has_code(line, 'D', 'O'))
                                extract(docu, line, ';');
                }
        } while (!has_pat);             /* skip entries without a pattern */

        return(TRUE);
}

/*** extract ******************************************************************
 *
 * Extract a substring 'so' from string 'si', ending with character 'end'.
 *
 * The leading line code (everything up to the first blank) and the blanks
 * following it are skipped; the characters after that are copied to 'so'
 * up to, but not including, 'end'.  When 'si' contains no blank or no
 * 'end' character, scanning stops at the end of the string instead of
 * running past it.  'so' is always NUL-terminated; the caller must supply
 * a buffer large enough for the field.
 *
 */

void extract(char *so, char *si, char end)
{
        while (*si != '\0' && *si != ' ')       /* skip code    */
                si++;
        while (*si == ' ')                      /* skip blanks  */
                si++;
        while (*si != '\0' && *si != end)       /* find end     */
                *so++ = *si++;
        *so = '\0';
}

/*** get_pat ******************************************************************
 *
 * Extract the pattern from the PA-line(s).
 *
 */

int get_pat(char *so, char *si, char end)
{
cont:
        while (*si != ' ') *si++;               /* skip code    */
        while (*si == ' ') *si++;               /* skip blanks  */
        while (*si && *si != end && *si != NEWLINE)
                *so++ = *si++;
        if (*si != end) {
                if (fgets(si, MAXSTR, fi) == NULL) return(FALSE);
                goto cont;
        }
        *so++ = end;
        *so = '\0';
        return(TRUE);
}

/*** check_title **************************************************************
 *
 * Replace every blank in 'title' by an underscore, in place.
 * NOTE: the original author marked this function as obsolete; it is
 * nevertheless still called from read_data().
 *
 */

void check_title(void)
{
        char *p;

        for (p = title; *p != '\0'; p++) {
                if (*p == ' ')
                        *p = '_';
        }
}

/*** close_files **************************************************************
 *
 * Close the input- and output-file.
 *
 */

void close_files(void)
{
        fclose(fi);
        fclose(fo);
        exit(0);
}

/*** initialize ***************************************************************
 *
 * Prepare for translating a new entry: one (empty) pattern in use, all
 * rows of the pattern table emptied, and the read position placed at the
 * start of 'patlin'.
 *
 */

void initialize(void)
{
        int row;

        ptr = patlin;
        number = 1;
        for (row = 0; row < MAXNUM; row++)
                pattern[row][0] = '\0';
}

/*** read_word ****************************************************************
 *
 * Read a 'word', i.e. one pattern entity.
 *
 */

int read_word(void)
{
        char *w = word;

        while (*ptr != '-' && *ptr != '.') {
                *w++ = *ptr++;
        }
        *w = '\0';
        return ((int)*++ptr);
}

/*** process ******************************************************************
 *
 * Translate a pattern unit into valid AWK pattern description(s).
 *
 */

int process(void)
{
        int first, last, numold, i, j, k, ret, rval = TRUE;
        char str[MAXWRD], tmp[MAXSTR];

        ret = parse_word(str, &first, &last);

        if (ret & N_TERM) {
                /*
                 * Special case: N-terminus
                 */
                if (ret & INLIST && ! (ret & EXCLUD)) {
                        for (i = 0; i < number; i++)
                                strcpy(pattern[number+i], pattern[i]);
                        for (i = 0; i < number; i++)
                                strcat(pattern[i], "^");
                        for (i = number; i < 2*number; i++)
                                strcat(pattern[i], str);
                        number *= 2;
                }
                else {
                        tmp[0] = (ret & EXCLUD) ? '.' : '^';
                        tmp[1] = '\0';
                        for (i = 0; i < number; i++)
                                strcat(pattern[i], tmp);
                        rval = add_pattern(str, first, last);
                }
        }

        else if (ret & C_TERM) {
                /*
                 * Special case: C-terminus
                 */
                if (ret & INLIST && ! (ret & EXCLUD)) {
                        for (i = 0; i < number; i++)
                                strcpy(pattern[number+i], pattern[i]);
                        for (i = 0; i < number; i++)
                                strcat(pattern[i], "$");
                        for (i = number; i < 2*number; i++)
                                strcat(pattern[i], str);
                        number *= 2;
                }
                else {
                        rval = add_pattern(str, first, last);
                        tmp[0] = (ret & EXCLUD) ? '.' : '$';
                        tmp[1] = '\0';
                        for (i = 0; i < number; i++)
                                strcat(pattern[i], tmp);
                }
        }

        else
                /*
                 * no special case.
                 */
                rval = add_pattern(str, first, last);

	return (rval);
}

/*** add_pattern **************************************************************
 *
 */

int add_pattern(char *str, int first, int last)
{
        int numold, i, j, k;
        char tmp[MAXSTR];

        numold = number;
        if (last - first) {
                for (i = first; i < last; i++) {
			if (number+numold > MAXNUM) {
				fprintf(stderr,
"*** TO MANY PATTERNS IN \"%s\" (>%d) *** SKIPPED! ***\n", code, MAXNUM);
				return(FALSE);
			}
                        for (j = 0; j <= numold; j++)
                                strcpy(pattern[number+j], pattern[j]);
                        number += numold;
                }
        }
        for (i = first; i <= last; i++) {
                tmp[0] = '\0';
                for (j = 0; j < i; j++)
                        strcat(tmp, str);
                for (j = 0; j < numold; j++) {
                        k = numold * (i - first) + j;
                        strcat(pattern[k], tmp);
                }
        }
	return(TRUE);
}

/*** parse_word ***************************************************************
 *
 * Parse the pattern entity: translate the 'word' into AWK rules, and
 * determine the repeat factor/range, if present.
 *
 * The translation is written to 'str' (NUL-terminated):
 *      x -> .          [ -> [          ] -> ]
 *      { -> [^         } -> ]
 * '<' and '>' and a "(n)" or "(n,m)" repeat specification produce no
 * output; all other characters are copied unchanged.
 *
 * '*first' and '*last' receive the repeat range; both are 1 when the word
 * has no repeat specification, and equal when only one number is given.
 * A repeat specification that lacks its closing ')' is read up to the end
 * of the word rather than beyond it.
 *
 * Returns a combination of the flags N_TERM, C_TERM, INLIST and EXCLUD.
 *
 */

int parse_word(char *str, int *first, int *last)
{
        char *w = word, *s = str;
        int term = 0, exclude = 0;
        int beg_list = FALSE, end_list = FALSE;

        *first = *last = 1;

        while (*w != '\0') {
                switch (*w) {

                case '<':
                        term |= N_TERM;
                        if (beg_list)
                                term |= INLIST;
                        break;
                case '>':
                        term |= C_TERM;
                        if (beg_list && !end_list)
                                term |= INLIST;
                        break;
                case 'x':
                        *s++ = '.';
                        break;
                case '[':
                        *s++ = '[';
                        beg_list = TRUE;
                        break;
                case '{':
                        *s++ = '[';
                        *s++ = '^';
                        exclude = EXCLUD;
                        beg_list = TRUE;
                        break;
                case ']':
                        *s++ = ']';
                        end_list = TRUE;
                        break;
                case '}':
                        *s++ = ']';
                        exclude = EXCLUD;
                        end_list = TRUE;
                        break;
                case '(':
                        *first = *last = 0;
                        for (w++; *w != '\0' && *w != ',' && *w != ')'; w++)
                                *first = *first * 10 + *w - '0';
                        if (*w == ',') {
                                for (w++; *w != '\0' && *w != ',' && *w != ')';
                                     w++)
                                        *last = *last * 10 + *w - '0';
                        }
                        else
                                *last = *first;
                        /*
                         * Unterminated specification: 'w' rests on the
                         * NUL, so re-test the loop condition without
                         * stepping past it.
                         */
                        if (*w == '\0')
                                continue;
                        break;
                default:
                        *s++ = *w;
                        break;
                }
                w++;
        }
        *s = '\0';

        return (term|exclude);
}

/*** print_pat ****************************************************************
 *
 * Write the translated pattern(s) to the output-file: one line per pattern
 * in use, holding the code, the pattern, the title and the documentation
 * reference, separated by single blanks.
 *
 */

void print_pat(void)
{
        int row = 0;

        while (row < number) {
                fprintf(fo, "%s %s %s %s\n",
                        code, pattern[row], title, docu);
                row++;
        }
}

