PCRE: Alternativer Algo 2 verhält sich komisch

PCRE: Alternativer Algo 2 verhält sich komisch

Hallo ihr, ich hab' da ein Problem mit PCRE (http://www.pcre.org, für jene, die es nicht kennen). PCRE bietet (bekanntlich) 2 verschiedene Matching-Algorithmen an: Der Eine ist Perl-kompatibel, der Andere findet immer alle Matches, da er beim Finden eines Matches nicht in die Weiter -> Backtracking-Schleife reinhüpft. Ich hab' zum Testen mal folgendes zusammengeschustert: Code: // re.cpp : Definiert den Einstiegspunkt für die Konsolenanwendung. // #include <stdio.h> #include <stdlib.h> #include <pcre.h> #include <string.h> //For atoi #include <locale.h> //For setlocale #define ARGNUM 5 #define DEBUG int main(int argc, char *argv[]) {   if(!strncmp(argv[1],"-h",2)) {     printf("RE by TME and MB of H&H (leet!)\n");     printf("Usage: %s \"<string>\" \"<pattern>\" <algorithm> <LF-treatment>\n", argv[0]);     printf("\nParameters:\n\n");     printf("String: A string against which the pattern is matched.\n");     printf("Pattern: A pattern (regex) to match against the previous parameter.\n");     printf("Algorithm: Algorithm to use. 0 = perl-compatible matching, 1 = PCRE-algo.\n");     printf("LF-Treatment: How CR and/or LF (line terminations) should be treated.\n");     printf("  0: CR counts as a line break.\n");     printf("  1: LF counts as a line break.\n");     printf("  2: The CR/LF sequence counts as a line break.\n");     printf("  3: Any unicode newline sequence counts as a line break.\n");     return -1;   };   const char *re_string=argv[1];   const char *re_pattern=argv[2]; #ifdef DEBUG         printf("Habe re_string <%s> erkannt. Länge des Strings: %d.\n", re_string, strlen(re_string));         printf("Habe re_pattern <%s> erkannt. Länge des Pattern: %d.\n", re_pattern, strlen(re_pattern)); #endif         if(argc<ARGNUM && argc!=2) {     printf("%s \"<string>\" \"<pattern>\" <algorithm> <LF-treatment>\n", argv[0]);     printf("\"<algorithm>\" in 0 or 1.\n");     printf("<LF-treatment> in 0 to 4.\n");     system("pause");     return -1;         };         int algorithm=0;   int crlfhandler=0;         algorithm=atoi(argv[3]);   crlfhandler=atoi(argv[4]);         if(crlfhandler<0||crlfhandler>3) {                 printf("Invalid CRLF-Handling number provided (0 to 4 are supported).\n");         };   setlocale(LC_CTYPE, "german");   const unsigned char *tables;   tables = pcre_maketables();         int options=0;         switch (crlfhandler) {                   case 0:                   options|=PCRE_NEWLINE_CR;                   break;                   case 1:                           options|=PCRE_NEWLINE_LF;                           break;                 case 2:                   options|=PCRE_NEWLINE_CRLF;             break;                   case 3:                         options|=PCRE_NEWLINE_ANY;       break;           default:       break;         }; #ifdef DEBUG                printf("Optionen: %d.\n", options); #endif         pcre *re=NULL; // Das kompilierte Pattern   const char *error; // Textuale Fehlermeldung aus PCRE     // Wenn bei der Kompilierung des Pattern ein Fehler auftritt,   // wird dieser Wert auf den OFFSET des Zeichens von Anfang   // des Pattern gesetzt, wo der Kompilierungsfehler auftrat.   // Einfacher ging's leider nicht.   int erroffset=0;   re = pcre_compile(     re_pattern, //    PCRE_CASELESS|options,     PCRE_CASELESS,     &error,     &erroffset,     tables         );         if(0!=error) {                 printf("%s\n", error);     return -7;         };         pcre_extra *pe = pcre_study ( re, 0, &error ); //Nur, wenn wir weitere Informationen aus PCRE extrahieren wollen /*         void *generic=calloc(1,1024); //Allocate 1024 Byte   pcre_fullinfo(re_pattern, pe, , generic); */   int rc;   int ovector[120];   int wspace[200]; //Alternativer Algo braucht Platz zum Atmen   switch (algorithm) {     case 0: //Standard Perl Compatible Expressions       rc = pcre_exec (         re,         pe,         re_string,         strlen(re_string),         0, //Offset im String, ab wo soll gematcht werden         0, //Los Optionas! Ole!         ovector,         120       );       break;     case 1: //Extended PCRE Expressions       rc = pcre_dfa_exec(         re,         pe,         re_string, /* the subject string */         strlen(re_string),         0,         0,         ovector,         120,         wspace,         200       );       break;     default:       printf("Invalid algorithm number provided (0 and 1 supported, %i given).\n", algorithm);       return -3;   };     switch (rc) {     case 0:       break;     case PCRE_ERROR_MATCHLIMIT:       printf("pcre_exec() returned a PCRE_ERROR_MATCHLIMIT.\n");       return -1;       break;     case PCRE_ERROR_RECURSIONLIMIT:       printf("pcre_exec() returned a PCRE_ERROR_RECURSIONLIMIT.\n");       return -1;       break;   };   int i;   int substr_len;   int matchbeginn;   int matchend;   char *substring;   #ifdef DEBUG   printf("Entering matching algo ...\n"); #endif     for(i=1;i<rc;i++) { #ifdef DEBUG     printf("Processing substring number %d.\n", i); #endif         matchbeginn=ovector[i*2];     matchend=ovector[i*2+1]-1; #ifdef DEBUG        printf("Setting start of match %d to %d and end of match at %d.\n", i, matchbeginn, matchend); #endif         substr_len=matchend-matchbeginn+1; #ifdef DEBUG          printf("The length of the substring is %d.\n", substr_len); #endif         substring=(char*)calloc(1, substr_len+1);       if(!substring) {         printf("Substring memory allocation failed (%i bytes).\n", substr_len+1);         return -11;       };       char *beginn=(char *)((int)re_string + matchbeginn);       strncpy(substring, beginn, substr_len);       substring[substr_len]='\0'; //Terminated!       printf("Match %d: %s\n", i, substring);     free(substring);   };   system("pause");   return 0; } Matche ich hiermit das Pattern "(\w+)" gegen "Mando ist doof", so erhalte ich mit Algo 1 folgerichtig 1 Match ("Mando"), wie ich es auch erwarte. Algo 2 jedoch (pcre_dfa_exec()) bringt nur 4 Matches: "Mand", "Man", "Ma" und "M". Irgendwie fehlt mir jedoch die Idee, warum Match Nr. 5 ("Mando") fehlt. Hat jemand eine Idee? Thomas P.S.: Kompletter Aufruf war Code: re.exe "Mando ist doof" "Mando" 0 0 bzw. für den fehlgeschlagenen Versuch Code: re.exe "Mando ist doof" "Mando" 1 0