You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
263 lines
6.4 KiB
263 lines
6.4 KiB
// Copyright 2014 Paul Sokolovsky.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include "re1.5.h"
|
|
|
|
// Open a gap of `num` bytes at offset `at` of the bytecode buffer `code` by
// sliding the bytes in [at, *pc) towards higher offsets, then advance the
// write position *pc by `num`. The gap still holds the old bytes afterwards;
// the caller overwrites it. The regions overlap, hence memmove().
static void insert_code(char *code, int at, int num, int *pc)
{
    char *gap = code + at;
    int tail_len = *pc - at;

    memmove(gap + num, gap, tail_len);
    *pc = *pc + num;
}
|
|
|
|
// Operand for a relative jump: the instruction whose opcode sits at offset
// `at` occupies two bytes (opcode + operand), and the distance to `to` is
// counted from the byte right after it, hence the "- 2".
// Arguments are parenthesized so compound expressions expand correctly.
#define REL(at, to) ((to) - (at) - 2)
|
|
|
|
int re1_5_sizecode(const char *re)
|
|
{
|
|
int pc = 5 + NON_ANCHORED_PREFIX; // Save 0, Save 1, Match; more bytes for "search" (vs "match") prefix code
|
|
|
|
for (; *re; re++) {
|
|
switch (*re) {
|
|
case '\\':
|
|
re++;
|
|
default:
|
|
pc += 2;
|
|
break;
|
|
case '+':
|
|
// Skip entire "+?"
|
|
if (re[1] == '?')
|
|
re++;
|
|
case '?':
|
|
pc += 2;
|
|
break;
|
|
case '.':
|
|
case '^':
|
|
case '$':
|
|
pc++;
|
|
break;
|
|
case '*':
|
|
// Skip entire "*?"
|
|
if (re[1] == '?')
|
|
re++;
|
|
case '|':
|
|
case '(':
|
|
pc += 4;
|
|
break;
|
|
case ')':
|
|
break;
|
|
case '[': {
|
|
pc += 2;
|
|
re++;
|
|
if (*re == '^') re++;
|
|
while (*re != ']') {
|
|
if (!*re) return -1;
|
|
if (re[1] == '-') {
|
|
re += 2;
|
|
}
|
|
pc += 2;
|
|
re++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return pc;
|
|
}
|
|
|
|
// Store `byte` at offset `at` of the bytecode buffer. Relies on a variable
// named `code` being in scope at the point of use. Both arguments and the
// whole expansion are parenthesized so the macro behaves like an ordinary
// assignment expression wherever it is used.
#define EMIT(at, byte) (code[(at)] = (byte))
|
|
|
|
// Compile one (sub)expression of `re` into bytecode appended to prog->insts
// at offset prog->bytelen. Compilation stops at the end of the string or at
// an unmatched ')'; groups are handled by recursing on the text after '('.
//
// Returns a pointer to the character where compilation stopped (NUL or ')'),
// or NULL on a syntax error: nothing to repeat, unterminated character
// class, missing ')', or trailing backslash.
// On success prog->bytelen is updated to the new end of the code; prog->len
// and prog->sub are updated as instructions and groups are emitted.
//
// No bounds checks are done on prog->insts here.
// NOTE(review): presumably the caller sizes the buffer with
// re1_5_sizecode() first -- confirm against callers.
static const char *_compilecode(const char *re, ByteProg *prog)
{
    char *code = prog->insts;
    int pc = prog->bytelen;
    // Offset where this (sub)expression begins; '|' inserts its Split here.
    int start = pc;
    // Offset of the most recent atom, i.e. what a following ?, * or + applies to.
    int term = pc;
    // Offset of the operand byte of the Jmp closing the previous alternative,
    // still waiting to be patched; 0 means there is none.
    int alt_label = 0;

    for (; *re && *re != ')'; re++) {
        switch (*re) {
        case '\\':
            re++;
            // Trailing backslash: there is nothing to escape, and carrying on
            // would make the loop increment step past the terminating NUL.
            if (!*re) return NULL;
            // \d \D \s \S \w \W (case folded via "| 0x20") are named classes;
            // the letter itself is stored as the operand.
            if ((*re | 0x20) == 'd' || (*re | 0x20) == 's' || (*re | 0x20) == 'w') {
                term = pc;
                EMIT(pc++, NamedClass);
                EMIT(pc++, *re);
                prog->len++;
                break;
            }
            // Any other escaped character is matched literally.
            /* fallthrough */
        default:
            term = pc;
            EMIT(pc++, Char);
            EMIT(pc++, *re);
            prog->len++;
            break;
        case '.':
            term = pc;
            EMIT(pc++, Any);
            prog->len++;
            break;
        case '[': {
            int cnt;
            term = pc;
            re++;
            if (*re == '^') {
                EMIT(pc++, ClassNot);
                re++;
            } else {
                EMIT(pc++, Class);
            }
            pc++; // Skip # of pair byte
            prog->len++;
            // Each entry is stored as a (low, high) pair; a single character
            // is stored as a pair with both bytes equal.
            for (cnt = 0; *re != ']'; re++, cnt++) {
                if (!*re) return NULL;
                EMIT(pc++, *re);
                if (re[1] == '-') {
                    re += 2;
                    // "x-" followed by end of pattern: no upper bound, and
                    // the loop increment would step past the NUL.
                    if (!*re) return NULL;
                }
                EMIT(pc++, *re);
            }
            // Fill in the pair count that was skipped above.
            EMIT(term + 1, cnt);
            break;
        }
        case '(': {
            term = pc;
            int sub = ++prog->sub;

            EMIT(pc++, Save);
            EMIT(pc++, 2 * sub);
            prog->len++;

            // The recursive call reads and writes prog->bytelen, so publish
            // the current position before it and reload it afterwards.
            prog->bytelen = pc;
            re = _compilecode(re + 1, prog);
            if (re == NULL || *re != ')') return NULL; // error, or no matching paren
            pc = prog->bytelen;

            EMIT(pc++, Save);
            EMIT(pc++, 2 * sub + 1);
            prog->len++;

            break;
        }
        case '?':
            if (pc == term) return NULL; // nothing to repeat
            // Put a Split in front of the atom; its operand targets the code
            // following the atom.
            insert_code(code, term, 2, &pc);
            EMIT(term, Split);
            EMIT(term + 1, REL(term, pc));
            prog->len++;
            break;
        case '*':
            if (pc == term) return NULL; // nothing to repeat
            // Layout: Split/RSplit, atom, Jmp back to the split.
            insert_code(code, term, 2, &pc);
            EMIT(pc, Jmp);
            EMIT(pc + 1, REL(pc, term));
            pc += 2;
            if (re[1] == '?') {
                // Non-greedy "*?"
                EMIT(term, RSplit);
                re++;
            } else {
                EMIT(term, Split);
            }
            EMIT(term + 1, REL(term, pc));
            prog->len += 2;
            break;
        case '+':
            if (pc == term) return NULL; // nothing to repeat
            // Layout: atom, then a split whose operand points back at the atom.
            if (re[1] == '?') {
                // Non-greedy "+?"
                EMIT(pc, Split);
                re++;
            } else {
                EMIT(pc, RSplit);
            }
            EMIT(pc + 1, REL(pc, term));
            pc += 2;
            prog->len++;
            break;
        case '|':
            // Patch the Jmp that ended the previous alternative (if any)
            // before the code is shifted below.
            if (alt_label) {
                EMIT(alt_label, REL(alt_label, pc) + 1);
            }
            insert_code(code, start, 2, &pc);
            EMIT(pc++, Jmp);
            alt_label = pc++;
            EMIT(start, Split);
            EMIT(start + 1, REL(start, pc));
            prog->len += 2;
            // The new alternative has no atom yet. Without this reset a
            // quantifier directly after '|' would pass the "nothing to
            // repeat" check and insert code at a stale offset in front of
            // alt_label, corrupting the program.
            term = pc;
            break;
        case '^':
            EMIT(pc++, Bol);
            prog->len++;
            break;
        case '$':
            EMIT(pc++, Eol);
            prog->len++;
            break;
        }
    }

    // Patch the Jmp of the last pending alternative to the end of this
    // (sub)expression.
    if (alt_label) {
        EMIT(alt_label, REL(alt_label, pc) + 1);
    }
    prog->bytelen = pc;
    return re;
}
|
|
|
|
// Compile the pattern `re` into `prog`, resetting the program's counters
// first. Returns 0 on success and 1 if the pattern could not be compiled or
// was not consumed up to its terminating NUL (e.g. a stray ')').
//
// NOTE(review): prog->insts is written without bounds checks; presumably the
// caller allocates it using re1_5_sizecode() -- confirm against callers.
int re1_5_compilecode(ByteProg *prog, const char *re)
{
    char *out = prog->insts;
    int n = 0;

    prog->sub = 0;

    // Prefix implementing non-anchored operation ("search"); for anchored
    // operation ("match") this code is simply skipped:
    //     RSplit +3 ; Any ; Jmp -5
    // TODO: Implement search in much more efficient manner
    out[n++] = RSplit;
    out[n++] = 3;
    out[n++] = Any;
    out[n++] = Jmp;
    out[n++] = -5;

    // Save 0: opens the whole-match capture.
    out[n++] = Save;
    out[n++] = 0;

    // Three prefix instructions plus the Save emitted so far.
    prog->len = 4;
    prog->bytelen = n;

    const char *rest = _compilecode(re, prog);
    if (rest == NULL || *rest != '\0') return 1;

    // Save 1 closes the whole-match capture, then Match ends the program.
    n = prog->bytelen;
    out[n++] = Save;
    out[n++] = 1;
    out[n++] = Match;
    prog->bytelen = n;
    prog->len += 2;

    return 0;
}
|
|
|
|
void
|
|
cleanmarks(ByteProg *prog)
|
|
{
|
|
char *pc = prog->insts;
|
|
char *end = pc + prog->bytelen;
|
|
while (pc < end) {
|
|
*pc &= 0x7f;
|
|
switch (*pc) {
|
|
case Jmp:
|
|
case Split:
|
|
case RSplit:
|
|
case Save:
|
|
case Char:
|
|
pc++;
|
|
}
|
|
pc++;
|
|
}
|
|
}
|
|
|
|
// Disabled scratch driver: compiles the pattern given as argv[1] and dumps
// the resulting bytecode.
// NOTE(review): this is stale. It calls re1_5_compilecode() with a single
// argument and uses the result as a ByteProg *, whereas the definition in
// this file takes (ByteProg *prog, const char *re) and returns an int status.
// The local `pc` is unused. Update before enabling.
#if 0
|
|
int main(int argc, char *argv[])
|
|
{
|
|
int pc = 0;
|
|
ByteProg *code = re1_5_compilecode(argv[1]);
|
|
re1_5_dumpcode(code);
|
|
}
|
|
#endif
|
|
|