
 
Профиль
Группа: Участник
Сообщений: 955
Регистрация: 8.8.2005
Где: At Home
Репутация: 15 Всего: 124
|
Вспомнилось... Занимался я как-то извлечением механизма обработки регулярных выражений из Perl'а... Увидел исходники... Брался несколько раз и откладывал... Потом как-то набрался мужества и осилил  Столько там встретил интересного  В общем, считай, заново переписал Ужас — одно слово. Просто — ужас Вот сейчас скачал версию 5.8.8 stable Полюбуйтесь на файл regcomp.cВот функция, имеющая отношение к оптимизации регулярного выражения... Ровно 1000 строк! Ну, чуть больше, просто в SciTE сейчас смотрю, свернул её "плюсиком", открывающая скобка на 674 строке, а следующая за свёрнутым блоком строка — 1674  Пусть Ларри Уоллу будет стыдно  Код | /* REx optimizer. Converts nodes into quickier variants "in place". Finds fixed substrings. */
/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set to the position after last scanned or to NULL. */
STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ { I32 min = 0, pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
while (scan && OP(scan) != END && scan < last) { /* Peephole optimizer: */
if (PL_regkind[(U8)OP(scan)] == EXACT) { /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; #ifdef DEBUGGING regnode *stop = scan; #endif
next = scan + NODE_SZ_STR(scan); /* Skip NOTHING, merge EXACT*. */ while (n && ( PL_regkind[(U8)OP(n)] == NOTHING || (stringok && (OP(n) == OP(scan)))) && NEXT_OFF(n) && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { if (OP(n) == TAIL || n > next) stringok = 0; if (PL_regkind[(U8)OP(n)] == NOTHING) { NEXT_OFF(scan) += NEXT_OFF(n); next = n + NODE_STEP_REGNODE; #ifdef DEBUGGING if (stringok) stop = n; #endif n = regnext(n); } else if (stringok) { const int oldl = STR_LEN(scan); regnode *nnext = regnext(n);
if (oldl + STR_LEN(n) > U8_MAX) break; NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); #ifdef DEBUGGING stop = next - 1; #endif n = nnext; } }
if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) { /* Two problematic code points in Unicode casefolding of EXACT nodes:
U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
which casefold to
Unicode UTF-8
U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
This means that in case-insensitive matching (or "loose matching", as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte length of the above casefolded versions) can match a target string of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). This would rather mess up the minimum length computation.
What we'll do is to look for the tail four bytes, and then peek at the preceding two bytes to see whether we need to decrease the minimum length by four (six minus two).
Thanks to the design of UTF-8, there cannot be false matches: A sequence of valid UTF-8 bytes cannot be a subsequence of another valid sequence of UTF-8 bytes.
*/ char *s0 = STRING(scan), *s, *t; char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4; const char * const t0 = "\xcc\x88\xcc\x81"; const char * const t1 = t0 + 3;
for (s = s0 + 2; s < s2 && (t = ninstr(s, s1, t0, t1)); s = t + 4) { if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) min -= 4; } }
#ifdef DEBUGGING /* Allow dumping */ n = scan + NODE_SZ_STR(scan); while (n <= stop) { if (PL_regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { OP(n) = OPTIMIZED; NEXT_OFF(n) = 0; } n++; } #endif } /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ if (OP(scan) != CURLYX) { const int max = (reg_off_by_arg[OP(scan)] ? I32_MAX /* I32 may be smaller than U16 on CRAYs! */ : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); int noff; regnode *n = scan; /* Skip NOTHING and LONGJMP. */ while ((n = regnext(n)) && ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n))) || ((OP(n) == LONGJMP) && (noff = ARG(n)))) && off + noff < max) off += noff; if (reg_off_by_arg[OP(scan)]) ARG(scan) = off; else NEXT_OFF(scan) = off; } /* The principal pseudo-switch. Cannot be a switch, since we look into several different things. */ if (OP(scan) == BRANCH || OP(scan) == BRANCHJ || OP(scan) == IFTHEN || OP(scan) == SUSPEND) { next = regnext(scan); code = OP(scan); if (OP(next) == code || code == IFTHEN || code == SUSPEND) { I32 max1 = 0, min1 = I32_MAX, num = 0; struct regnode_charclass_class accum; if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) cl_init_zero(pRExC_state, &accum); while (OP(scan) == code) { I32 deltanext, minnext, f = 0, fake; struct regnode_charclass_class this_class;
num++; data_fake.flags = 0; if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; } else data_fake.last_closep = &fake; next = regnext(scan); scan = NEXTOPER(scan); if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ minnext = study_chunk(pRExC_state, &scan, &deltanext, next, &data_fake, f); if (min1 > minnext) min1 = minnext; if (max1 < minnext + deltanext) max1 = minnext + deltanext; if (deltanext == I32_MAX) is_inf = is_inf_internal = 1; scan = next; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data && (data_fake.flags & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; if (flags & SCF_DO_STCLASS) cl_or(pRExC_state, &accum, &this_class); if (code == SUSPEND) break; }
//... много строк пропущено, чтобы уместить "функцию" в одно сообщение
case NSPACE: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); for (value = 0; value < 256; value++) if (isSPACE(value)) ANYOF_BITMAP_CLEAR(data->start_class, value); } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); else { for (value = 0; value < 256; value++) if (!isSPACE(value)) ANYOF_BITMAP_SET(data->start_class, value); } } break; case NSPACEL: if (flags & SCF_DO_STCLASS_AND) { if (data->start_class->flags & ANYOF_LOCALE) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); for (value = 0; value < 256; value++) if (!isSPACE(value)) ANYOF_BITMAP_CLEAR(data->start_class, value); } } else { data->start_class->flags |= ANYOF_LOCALE; ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); } break; case DIGIT: if (flags & SCF_DO_STCLASS_AND) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); for (value = 0; value < 256; value++) if (!isDIGIT(value)) ANYOF_BITMAP_CLEAR(data->start_class, value); } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); else { for (value = 0; value < 256; value++) if (isDIGIT(value)) ANYOF_BITMAP_SET(data->start_class, value); } } break; case NDIGIT: if (flags & SCF_DO_STCLASS_AND) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); for (value = 0; value < 256; value++) if (isDIGIT(value)) ANYOF_BITMAP_CLEAR(data->start_class, value); } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); else { for (value = 0; value < 256; value++) if (!isDIGIT(value)) ANYOF_BITMAP_SET(data->start_class, value); } } break; } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, &and_with); flags &= ~SCF_DO_STCLASS; } } else if (PL_regkind[(U8)OP(scan)] == EOL && flags & SCF_DO_SUBSTR) { data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); } else if ( PL_regkind[(U8)OP(scan)] == BRANCHJ /* Lookbehind, or need to calculate parens/evals/stclass: */ && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { /* Lookahead/lookbehind */ I32 deltanext, minnext, fake = 0; regnode *nscan; struct regnode_charclass_class intrnl; int f = 0;
data_fake.flags = 0; if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; } else data_fake.last_closep = &fake; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ cl_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; f |= SCF_DO_STCLASS_AND; } if (flags & SCF_WHILEM_VISITED_POS) f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f); if (scan->flags) { if (deltanext) { vFAIL("Variable length lookbehind not implemented"); } else if (minnext > U8_MAX) { vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = (U8)minnext; } if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; if (data && (data_fake.flags & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; if (f & SCF_DO_STCLASS_AND) { const int was = (data->start_class->flags & ANYOF_EOS);
cl_and(data->start_class, &intrnl); if (was) data->start_class->flags |= ANYOF_EOS; } } else if (OP(scan) == OPEN) { pars++; } else if (OP(scan) == CLOSE) { if ((I32)ARG(scan) == is_par) { next = regnext(scan);
if ( next && (OP(next) != WHILEM) && next < last) is_par = 0; /* Disable optimization */ } if (data) *(data->last_closep) = ARG(scan); } else if (OP(scan) == EVAL) { if (data) data->flags |= SF_HAS_EVAL; } else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ if (flags & SCF_DO_SUBSTR) { scan_commit(pRExC_state,data); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; } /* Else: zero-length, ignore. */ scan = regnext(scan); }
finish: *scanp = scan; *deltap = is_inf_internal ? I32_MAX : delta; if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = I32_MAX - data->pos_min; if (is_par > U8_MAX) is_par = 0; if (is_par && pars==1 && data) { data->flags |= SF_IN_PAR; data->flags &= ~SF_HAS_PAR; } else if (pars && data) { data->flags |= SF_HAS_PAR; data->flags &= ~SF_IN_PAR; } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, &and_with); return min; } |
А метка в конце — finish
|