Changeset 1062

Show
Ignore:
Timestamp:
11/11/07 20:48:10 (14 months ago)
Author:
stephen_booth
Message:

Upgraded to 3.5.2

Location:
trunk/ThirdParty/sqlite
Files:
4 added
102 modified

Legend:

Unmodified
Added
Removed
  • trunk/ThirdParty/sqlite/ext/fts3/fts3.c

    r980 r1062  
    289289#include "fts3_tokenizer.h" 
    290290#include "sqlite3.h" 
    291 #include "sqlite3ext.h" 
    292 SQLITE_EXTENSION_INIT1 
     291#ifndef SQLITE_CORE  
     292  #include "sqlite3ext.h" 
     293  SQLITE_EXTENSION_INIT1 
     294#endif 
    293295 
    294296 
     
    311313# define TRACE(A) 
    312314#endif 
     315 
     316/* 
     317** Default span for NEAR operators. 
     318*/ 
     319#define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10 
    313320 
    314321/* It is not safe to call isspace(), tolower(), or isalnum() on 
     
    13661373} 
    13671374 
    1368 /* pLeft and pRight are DLReaders positioned to the same docid. 
    1369 ** 
    1370 ** If there are no instances in pLeft or pRight where the position 
    1371 ** of pLeft is one less than the position of pRight, then this 
    1372 ** routine adds nothing to pOut. 
    1373 ** 
    1374 ** If there are one or more instances where positions from pLeft 
    1375 ** are exactly one less than positions from pRight, then add a new 
    1376 ** document record to pOut.  If pOut wants to hold positions, then 
    1377 ** include the positions from pRight that are one more than a 
    1378 ** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1. 
    1379 */ 
    1380 static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight, 
    1381                                DLWriter *pOut){ 
     1375/*  
     1376** This function is used as part of the implementation of phrase and 
     1377** NEAR matching. 
     1378** 
     1379** pLeft and pRight are DLReaders positioned to the same docid in 
     1380** lists of type DL_POSITION. This function writes an entry to the 
     1381** DLWriter pOut for each position in pRight that is less than 
     1382** (nNear+1) greater (but not equal to or smaller) than a position  
     1383** in pLeft. For example, if nNear is 0, and the positions contained 
     1384** by pLeft and pRight are: 
     1385** 
     1386**    pLeft:  5 10 15 20 
     1387**    pRight: 6  9 17 21 
     1388** 
     1389** then the docid is added to pOut. If pOut is of type DL_POSITIONS, 
     1390** then a positionids "6" and "21" are also added to pOut. 
     1391** 
     1392** If boolean argument isSaveLeft is true, then positionids are copied 
     1393** from pLeft instead of pRight. In the example above, the positions "5" 
     1394** and "20" would be added instead of "6" and "21". 
     1395*/ 
     1396static void posListPhraseMerge( 
     1397  DLReader *pLeft,  
     1398  DLReader *pRight, 
     1399  int nNear, 
     1400  int isSaveLeft, 
     1401  DLWriter *pOut 
     1402){ 
    13821403  PLReader left, right; 
    13831404  PLWriter writer; 
     
    13951416    }else if( plrColumn(&left)>plrColumn(&right) ){ 
    13961417      plrStep(&right); 
    1397     }else if( plrPosition(&left)+1<plrPosition(&right) ){ 
    1398       plrStep(&left); 
    1399     }else if( plrPosition(&left)+1>plrPosition(&right) ){ 
     1418    }else if( plrPosition(&left)>=plrPosition(&right) ){ 
    14001419      plrStep(&right); 
    14011420    }else{ 
    1402       if( !match ){ 
    1403         plwInit(&writer, pOut, dlrDocid(pLeft)); 
    1404         match = 1; 
     1421      if( (plrPosition(&right)-plrPosition(&left))<=(nNear+1) ){ 
     1422        if( !match ){ 
     1423          plwInit(&writer, pOut, dlrDocid(pLeft)); 
     1424          match = 1; 
     1425        } 
     1426        if( !isSaveLeft ){ 
     1427          plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0); 
     1428        }else{ 
     1429          plwAdd(&writer, plrColumn(&left), plrPosition(&left), 0, 0); 
     1430        } 
     1431        plrStep(&right); 
     1432      }else{ 
     1433        plrStep(&left); 
    14051434      } 
    1406       plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0); 
    1407       plrStep(&left); 
    1408       plrStep(&right); 
    14091435    } 
    14101436  } 
     
    14191445} 
    14201446 
    1421 /* We have two doclists with positions:  pLeft and pRight. 
    1422 ** Write the phrase intersection of these two doclists into pOut. 
     1447/* 
     1448** Compare the values pointed to by the PLReaders passed as arguments.  
     1449** Return -1 if the value pointed to by pLeft is considered less than 
     1450** the value pointed to by pRight, +1 if it is considered greater 
     1451** than it, or 0 if it is equal. i.e. 
     1452** 
     1453**     (*pLeft - *pRight) 
     1454** 
     1455** A PLReader that is in the EOF condition is considered greater than 
     1456** any other. If neither argument is in EOF state, the return value of 
     1457** plrColumn() is used. If the plrColumn() values are equal, the 
     1458** comparison is on the basis of plrPosition(). 
     1459*/ 
     1460static int plrCompare(PLReader *pLeft, PLReader *pRight){ 
     1461  assert(!plrAtEnd(pLeft) || !plrAtEnd(pRight)); 
     1462 
     1463  if( plrAtEnd(pRight) || plrAtEnd(pLeft) ){ 
     1464    return (plrAtEnd(pRight) ? -1 : 1); 
     1465  } 
     1466  if( plrColumn(pLeft)!=plrColumn(pRight) ){ 
     1467    return ((plrColumn(pLeft)<plrColumn(pRight)) ? -1 : 1); 
     1468  } 
     1469  if( plrPosition(pLeft)!=plrPosition(pRight) ){ 
     1470    return ((plrPosition(pLeft)<plrPosition(pRight)) ? -1 : 1); 
     1471  } 
     1472  return 0; 
     1473} 
     1474 
     1475/* We have two doclists with positions:  pLeft and pRight. Depending 
     1476** on the value of the nNear parameter, perform either a phrase 
     1477** intersection (if nNear==0) or a NEAR intersection (if nNear>0) 
     1478** and write the results into pOut. 
    14231479** 
    14241480** A phrase intersection means that two documents only match 
    14251481** if pLeft.iPos+1==pRight.iPos. 
     1482** 
     1483** A NEAR intersection means that two documents only match if  
     1484** (abs(pLeft.iPos-pRight.iPos)<nNear). 
     1485** 
     1486** If a NEAR intersection is requested, then the nPhrase argument should 
     1487** be passed the number of tokens in the two operands to the NEAR operator 
     1488** combined. For example: 
     1489** 
     1490**       Query syntax               nPhrase 
     1491**      ------------------------------------ 
     1492**       "A B C" NEAR "D E"         5 
     1493**       A NEAR B                   2 
    14261494** 
    14271495** iType controls the type of data written to pOut.  If iType is 
     
    14311499  const char *pLeft, int nLeft, 
    14321500  const char *pRight, int nRight, 
    1433   DocListType iType, 
     1501  int nNear,            /* 0 for a phrase merge, non-zero for a NEAR merge */ 
     1502  int nPhrase,          /* Number of tokens in left+right operands to NEAR */ 
     1503  DocListType iType,    /* Type of doclist to write to pOut */ 
    14341504  DataBuffer *pOut      /* Write the combined doclist here */ 
    14351505){ 
     
    14511521      dlrStep(&right); 
    14521522    }else{ 
    1453       posListPhraseMerge(&left, &right, &writer); 
     1523      if( nNear==0 ){ 
     1524        posListPhraseMerge(&left, &right, 0, 0, &writer); 
     1525      }else{ 
     1526        /* This case occurs when two terms (simple terms or phrases) are 
     1527         * connected by a NEAR operator, span (nNear+1). i.e. 
     1528         * 
     1529         *     '"terrible company" NEAR widget' 
     1530         */ 
     1531        DataBuffer one = {0, 0, 0}; 
     1532        DataBuffer two = {0, 0, 0}; 
     1533 
     1534        DLWriter dlwriter2; 
     1535        DLReader dr1 = {0, 0, 0, 0, 0};  
     1536        DLReader dr2 = {0, 0, 0, 0, 0}; 
     1537 
     1538        dlwInit(&dlwriter2, iType, &one); 
     1539        posListPhraseMerge(&right, &left, nNear-3+nPhrase, 1, &dlwriter2); 
     1540        dlwInit(&dlwriter2, iType, &two); 
     1541        posListPhraseMerge(&left, &right, nNear-1, 0, &dlwriter2); 
     1542 
     1543        if( one.nData) dlrInit(&dr1, iType, one.pData, one.nData); 
     1544        if( two.nData) dlrInit(&dr2, iType, two.pData, two.nData); 
     1545 
     1546        if( !dlrAtEnd(&dr1) || !dlrAtEnd(&dr2) ){ 
     1547          PLReader pr1 = {0}; 
     1548          PLReader pr2 = {0}; 
     1549 
     1550          PLWriter plwriter; 
     1551          plwInit(&plwriter, &writer, dlrDocid(dlrAtEnd(&dr1)?&dr2:&dr1)); 
     1552 
     1553          if( one.nData ) plrInit(&pr1, &dr1); 
     1554          if( two.nData ) plrInit(&pr2, &dr2); 
     1555          while( !plrAtEnd(&pr1) || !plrAtEnd(&pr2) ){ 
     1556            int iCompare = plrCompare(&pr1, &pr2); 
     1557            switch( iCompare ){ 
     1558              case -1: 
     1559                plwCopy(&plwriter, &pr1); 
     1560                plrStep(&pr1); 
     1561                break; 
     1562              case 1: 
     1563                plwCopy(&plwriter, &pr2); 
     1564                plrStep(&pr2); 
     1565                break; 
     1566              case 0: 
     1567                plwCopy(&plwriter, &pr1); 
     1568                plrStep(&pr1); 
     1569                plrStep(&pr2); 
     1570                break; 
     1571            } 
     1572          } 
     1573          plwTerminate(&plwriter); 
     1574        } 
     1575        dataBufferDestroy(&one); 
     1576        dataBufferDestroy(&two); 
     1577      } 
    14541578      dlrStep(&left); 
    14551579      dlrStep(&right); 
     
    16611785 
    16621786/* A single term in a query is represented by an instances of 
    1663 ** the following structure. 
     1787** the following structure. Each word which may match against 
     1788** document content is a term. Operators, like NEAR or OR, are 
     1789** not terms. Query terms are organized as a flat list stored 
     1790** in the Query.pTerms array. 
     1791** 
     1792** If the QueryTerm.nPhrase variable is non-zero, then the QueryTerm 
     1793** is the first in a contiguous string of terms that are either part 
     1794** of the same phrase, or connected by the NEAR operator. 
     1795** 
     1796** If the QueryTerm.nNear variable is non-zero, then the token is followed  
     1797** by a NEAR operator with span set to (nNear-1). For example, the  
     1798** following query: 
     1799** 
     1800** The QueryTerm.iPhrase variable stores the index of the token within 
     1801** it's phrase, indexed starting at 1, or 1 if the token is not part  
     1802** of any phrase. 
     1803** 
     1804** For example, the data structure used to represent the following query: 
     1805** 
     1806**     ... MATCH 'sqlite NEAR/5 google NEAR/2 "search engine"' 
     1807** 
     1808** is: 
     1809** 
     1810**     {nPhrase=4, iPhrase=1, nNear=6, pTerm="sqlite"}, 
     1811**     {nPhrase=0, iPhrase=1, nNear=3, pTerm="google"}, 
     1812**     {nPhrase=0, iPhrase=1, nNear=0, pTerm="search"}, 
     1813**     {nPhrase=0, iPhrase=2, nNear=0, pTerm="engine"}, 
     1814** 
     1815** compiling the FTS3 syntax to Query structures is done by the parseQuery() 
     1816** function. 
    16641817*/ 
    16651818typedef struct QueryTerm { 
     
    16671820  short int iPhrase; /* This is the i-th term of a phrase. */ 
    16681821  short int iColumn; /* Column of the index that must match this term */ 
     1822  signed char nNear; /* term followed by a NEAR operator with span=(nNear-1) */ 
    16691823  signed char isOr;  /* this term is preceded by "OR" */ 
    16701824  signed char isNot; /* this term is preceded by "-" */ 
     
    17081862  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */ 
    17091863  int nextIsOr;         /* Set the isOr flag on the next inserted term */ 
     1864  int nextIsNear;       /* Set the isOr flag on the next inserted term */ 
    17101865  int nextColumn;       /* Next word parsed must be in this column */ 
    17111866  int dfltColumn;       /* The default column */ 
     
    17241879    short int iCol;      /* The column that contains the match */ 
    17251880    short int iTerm;     /* The index in Query.pTerms[] of the matching term */ 
     1881    int iToken;          /* The index of the matching document token */ 
    17261882    short int nByte;     /* Number of bytes in the term */ 
    17271883    int iStart;          /* The offset to the first character of the term */ 
     
    29533109  Snippet *p,               /* Append the entry to this snippet */ 
    29543110  int iCol, int iTerm,      /* The column and query term */ 
     3111  int iToken,               /* Matching token in document */ 
    29553112  int iStart, int nByte     /* Offset and size of the match */ 
    29563113){ 
     
    29703127  pMatch->iCol = iCol; 
    29713128  pMatch->iTerm = iTerm; 
     3129  pMatch->iToken = iToken; 
    29723130  pMatch->iStart = iStart; 
    29733131  pMatch->nByte = nByte; 
     
    30433201        for(j=aTerm[i].iPhrase-1; j>=0; j--){ 
    30443202          int k = (iRotor-j) & FTS3_ROTOR_MASK; 
    3045           snippetAppendMatch(pSnippet, iColumn, i-j, 
     3203          snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j, 
    30463204                iRotorBegin[k], iRotorLen[k]); 
    30473205        } 
     
    30543212} 
    30553213 
     3214/* 
     3215** Remove entries from the pSnippet structure to account for the NEAR 
     3216** operator. When this is called, pSnippet contains the list of token  
     3217** offsets produced by treating all NEAR operators as AND operators. 
     3218** This function removes any entries that should not be present after 
     3219** accounting for the NEAR restriction. For example, if the queried 
     3220** document is: 
     3221** 
     3222**     "A B C D E A" 
     3223** 
     3224** and the query is: 
     3225**  
     3226**     A NEAR/0 E 
     3227** 
     3228** then when this function is called the Snippet contains token offsets 
     3229** 0, 4 and 5. This function removes the "0" entry (because the first A 
     3230** is not near enough to an E). 
     3231*/ 
     3232static void trimSnippetOffsetsForNear(Query *pQuery, Snippet *pSnippet){ 
     3233  int ii; 
     3234  int iDir = 1; 
     3235 
     3236  while(iDir>-2) { 
     3237    assert( iDir==1 || iDir==-1 ); 
     3238    for(ii=0; ii<pSnippet->nMatch; ii++){ 
     3239      int jj; 
     3240      int nNear; 
     3241      struct snippetMatch *pMatch = &pSnippet->aMatch[ii]; 
     3242      QueryTerm *pQueryTerm = &pQuery->pTerms[pMatch->iTerm]; 
     3243 
     3244      if( (pMatch->iTerm+iDir)<0  
     3245       || (pMatch->iTerm+iDir)>=pQuery->nTerms 
     3246      ){ 
     3247        continue; 
     3248      } 
     3249      
     3250      nNear = pQueryTerm->nNear; 
     3251      if( iDir<0 ){ 
     3252        nNear = pQueryTerm[-1].nNear; 
     3253      } 
     3254   
     3255      if( pMatch->iTerm>=0 && nNear ){ 
     3256        int isOk = 0; 
     3257        int iNextTerm = pMatch->iTerm+iDir; 
     3258        int iPrevTerm = iNextTerm; 
     3259 
     3260        int iEndToken; 
     3261        int iStartToken; 
     3262 
     3263        if( iDir<0 ){ 
     3264          int nPhrase = 1; 
     3265          iStartToken = pMatch->iToken; 
     3266          while( (pMatch->iTerm+nPhrase)<pQuery->nTerms  
     3267              && pQuery->pTerms[pMatch->iTerm+nPhrase].iPhrase>1  
     3268          ){ 
     3269            nPhrase++; 
     3270          } 
     3271          iEndToken = iStartToken + nPhrase - 1; 
     3272        }else{ 
     3273          iEndToken   = pMatch->iToken; 
     3274          iStartToken = pMatch->iToken+1-pQueryTerm->iPhrase; 
     3275        } 
     3276 
     3277        while( pQuery->pTerms[iNextTerm].iPhrase>1 ){ 
     3278          iNextTerm--; 
     3279        } 
     3280        while( (iPrevTerm+1)<pQuery->nTerms &&  
     3281               pQuery->pTerms[iPrevTerm+1].iPhrase>1  
     3282        ){ 
     3283          iPrevTerm++; 
     3284        } 
     3285   
     3286        for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){ 
     3287          struct snippetMatch *p = &pSnippet->aMatch[jj]; 
     3288          if( p->iCol==pMatch->iCol && (( 
     3289               p->iTerm==iNextTerm &&  
     3290               p->iToken>iEndToken &&  
     3291               p->iToken<=iEndToken+nNear 
     3292          ) || ( 
     3293               p->iTerm==iPrevTerm &&  
     3294               p->iToken<iStartToken &&  
     3295               p->iToken>=iStartToken-nNear 
     3296          ))){ 
     3297            isOk = 1; 
     3298          } 
     3299        } 
     3300        if( !isOk ){ 
     3301          for(jj=1-pQueryTerm->iPhrase; jj<=0; jj++){ 
     3302            pMatch[jj].iTerm = -1; 
     3303          } 
     3304          ii = -1; 
     3305          iDir = 1; 
     3306        } 
     3307      } 
     3308    } 
     3309    iDir -= 2; 
     3310  } 
     3311} 
    30563312 
    30573313/* 
     
    30843340    snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc); 
    30853341  } 
     3342 
     3343  trimSnippetOffsetsForNear(&p->q, &p->snippet); 
    30863344} 
    30873345 
     
    30993357  for(i=0; i<p->nMatch; i++){ 
    31003358    struct snippetMatch *pMatch = &p->aMatch[i]; 
    3101     zBuf[0] = ' '; 
    3102     sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, 
    3103         pMatch->iTerm, pMatch->iStart, pMatch->nByte); 
    3104     append(&sb, zBuf); 
    3105     cnt++; 
     3359    if( pMatch->iTerm>=0 ){ 
     3360      /* If snippetMatch.iTerm is less than 0, then the match was  
     3361      ** discarded as part of processing the NEAR operator (see the  
     3362      ** trimSnippetOffsetsForNear() function for details). Ignore  
     3363      ** it in this case 
     3364      */ 
     3365      zBuf[0] = ' '; 
     3366      sprintf(&zBuf[cnt>0], "%d %d %d %d", pMatch->iCol, 
     3367          pMatch->iTerm, pMatch->iStart, pMatch->nByte); 
     3368      append(&sb, zBuf); 
     3369      cnt++; 
     3370    } 
    31063371  } 
    31073372  p->zOffset = stringBufferData(&sb); 
     
    33513616*/ 
    33523617static int docListOfTerm( 
    3353   fulltext_vtab *v,   /* The full text index */ 
    3354   int iColumn,        /* column to restrict to.  No restriction if >=nColumn */ 
    3355   QueryTerm *pQTerm,  /* Term we are looking for, or 1st term of a phrase */ 
    3356   DataBuffer *pResult /* Write the result here */ 
     3618  fulltext_vtab *v,    /* The full text index */ 
     3619  int iColumn,         /* column to restrict to.  No restriction if >=nColumn */ 
     3620  QueryTerm *pQTerm,   /* Term we are looking for, or 1st term of a phrase */ 
     3621  DataBuffer *pResult  /* Write the result here */ 
    33573622){ 
    33583623  DataBuffer left, right, new; 
     
    33673632  dataBufferInit(&left, 0); 
    33683633  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix, 
    3369                   0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left); 
     3634                  (0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS), &left); 
    33703635  if( rc ) return rc; 
    33713636  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){ 
     3637    /* If this token is connected to the next by a NEAR operator, and 
     3638    ** the next token is the start of a phrase, then set nPhraseRight 
     3639    ** to the number of tokens in the phrase. Otherwise leave it at 1. 
     3640    */ 
     3641    int nPhraseRight = 1; 
     3642    while( (i+nPhraseRight)<=pQTerm->nPhrase  
     3643        && pQTerm[i+nPhraseRight].nNear==0  
     3644    ){ 
     3645      nPhraseRight++; 
     3646    } 
     3647 
    33723648    dataBufferInit(&right, 0); 
    33733649    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, 
     
    33793655    dataBufferInit(&new, 0); 
    33803656    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData, 
    3381                        i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new); 
     3657                       pQTerm[i-1].nNear, pQTerm[i-1].iPhrase + nPhraseRight, 
     3658                       ((i<pQTerm->nPhrase) ? DL_POSITIONS : DL_DOCIDS), 
     3659                       &new); 
    33823660    dataBufferDestroy(&left); 
    33833661    dataBufferDestroy(&right); 
     
    34713749      continue; 
    34723750    } 
    3473     if( !inPhrase && pQuery->nTerms>0 && nToken==2 
    3474          && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){ 
     3751    if( !inPhrase && pQuery->nTerms>0 && nToken==2  
     3752     && pSegment[iBegin+0]=='O' 
     3753     && pSegment[iBegin+1]=='R'  
     3754    ){ 
    34753755      pQuery->nextIsOr = 1; 
    34763756      continue; 
    34773757    }