match.c

説明を見る。
00001 /*
00002    Copyright (C) Andrew Tridgell 1996
00003    Copyright (C) Paul Mackerras 1996
00004 
00005    This program is free software; you can redistribute it and/or modify
00006    it under the terms of the GNU General Public License as published by
00007    the Free Software Foundation; either version 2 of the License, or
00008    (at your option) any later version.
00009 
00010    This program is distributed in the hope that it will be useful,
00011    but WITHOUT ANY WARRANTY; without even the implied warranty of
00012    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013    GNU General Public License for more details.
00014 
00015    You should have received a copy of the GNU General Public License
00016    along with this program; if not, write to the Free Software
00017    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00018 */
00019 
00020 #include "rsync.h"
00021 
00022 extern int verbose;
00023 extern int do_progress;
00024 extern int checksum_seed;
00025 extern int append_mode;
00026 
00027 int updating_basis_file;
00028 
00029 static int false_alarms;
00030 static int hash_hits;
00031 static int matches;
00032 static int64 data_transfer;
00033 
00034 static int total_false_alarms;
00035 static int total_hash_hits;
00036 static int total_matches;
00037 
00038 extern struct stats stats;
00039 
00040 #define TABLESIZE (1<<16)
00041 
00042 static int32 *hash_table;
00043 
00044 #define SUM2HASH2(s1,s2) (((s1) + (s2)) & 0xFFFF)
00045 #define SUM2HASH(sum) SUM2HASH2((sum)&0xFFFF,(sum)>>16)
00046 
00047 static void build_hash_table(struct sum_struct *s)
00048 {
00049         int32 i;
00050 
00051         if (!hash_table) {
00052                 hash_table = new_array(int32, TABLESIZE);
00053                 if (!hash_table)
00054                         out_of_memory("build_hash_table");
00055         }
00056 
00057         memset(hash_table, 0xFF, TABLESIZE * sizeof hash_table[0]);
00058 
00059         for (i = 0; i < s->count; i++) {
00060                 uint32 t = SUM2HASH(s->sums[i].sum1);
00061                 s->sums[i].chain = hash_table[t];
00062                 hash_table[t] = i;
00063         }
00064 }
00065 
00066 
00067 static OFF_T last_match;
00068 
00069 
00070 /**
00071  * Transmit a literal and/or match token.
00072  *
00073  * This delightfully-named function is called either when we find a
00074  * match and need to transmit all the unmatched data leading up to it,
00075  * or when we get bored of accumulating literal data and just need to
00076  * transmit it.  As a result of this second case, it is called even if
00077  * we have not matched at all!
00078  *
00079  * @param i If >0, the number of a matched token.  If 0, indicates we
00080  * have only literal data.
00081  **/
00082 static void matched(int f, struct sum_struct *s, struct map_struct *buf,
00083                     OFF_T offset, int32 i)
00084 {
00085         int32 n = offset - last_match; /* max value: block_size (int32) */
00086         int32 j;
00087 
00088         if (verbose > 2 && i >= 0) {
00089                 rprintf(FINFO,
00090                         "match at %.0f last_match=%.0f j=%d len=%ld n=%ld\n",
00091                         (double)offset, (double)last_match, i,
00092                         (long)s->sums[i].len, (long)n);
00093         }
00094 
00095         send_token(f, i, buf, last_match, n, i < 0 ? 0 : s->sums[i].len);
00096         data_transfer += n;
00097 
00098         if (i >= 0) {
00099                 stats.matched_data += s->sums[i].len;
00100                 n += s->sums[i].len;
00101         }
00102 
00103         for (j = 0; j < n; j += CHUNK_SIZE) {
00104                 int32 n1 = MIN(CHUNK_SIZE, n - j);
00105                 sum_update(map_ptr(buf, last_match + j, n1), n1);
00106         }
00107 
00108         if (i >= 0)
00109                 last_match = offset + s->sums[i].len;
00110         else
00111                 last_match = offset;
00112 
00113         if (buf && do_progress)
00114                 show_progress(last_match, buf->file_size);
00115 }
00116 
00117 
00118 static void hash_search(int f,struct sum_struct *s,
00119                         struct map_struct *buf, OFF_T len)
00120 {
00121         OFF_T offset, end, backup;
00122         int32 k, want_i;
00123         char sum2[SUM_LENGTH];
00124         uint32 s1, s2, sum;
00125         int more;
00126         schar *map;
00127 
00128         /* want_i is used to encourage adjacent matches, allowing the RLL
00129          * coding of the output to work more efficiently. */
00130         want_i = 0;
00131 
00132         if (verbose > 2) {
00133                 rprintf(FINFO, "hash search b=%ld len=%.0f\n",
00134                         (long)s->blength, (double)len);
00135         }
00136 
00137         k = (int32)MIN(len, (OFF_T)s->blength);
00138 
00139         map = (schar *)map_ptr(buf, 0, k);
00140 
00141         sum = get_checksum1((char *)map, k);
00142         s1 = sum & 0xFFFF;
00143         s2 = sum >> 16;
00144         if (verbose > 3)
00145                 rprintf(FINFO, "sum=%.8x k=%ld\n", sum, (long)k);
00146 
00147         offset = 0;
00148 
00149         end = len + 1 - s->sums[s->count-1].len;
00150 
00151         if (verbose > 3) {
00152                 rprintf(FINFO, "hash search s->blength=%ld len=%.0f count=%.0f\n",
00153                         (long)s->blength, (double)len, (double)s->count);
00154         }
00155 
00156         do {
00157                 int done_csum2 = 0;
00158                 int32 i;
00159 
00160                 if (verbose > 4) {
00161                         rprintf(FINFO, "offset=%.0f sum=%04x%04x\n",
00162                                 (double)offset, s2 & 0xFFFF, s1 & 0xFFFF);
00163                 }
00164 
00165                 i = hash_table[SUM2HASH2(s1,s2)];
00166                 if (i < 0)
00167                         goto null_hash;
00168 
00169                 sum = (s1 & 0xffff) | (s2 << 16);
00170                 hash_hits++;
00171                 do {
00172                         int32 l;
00173 
00174                         if (sum != s->sums[i].sum1)
00175                                 continue;
00176 
00177                         /* also make sure the two blocks are the same length */
00178                         l = (int32)MIN((OFF_T)s->blength, len-offset);
00179                         if (l != s->sums[i].len)
00180                                 continue;
00181 
00182                         /* in-place: ensure chunk's offset is either >= our
00183                          * offset or that the data didn't move. */
00184                         if (updating_basis_file && s->sums[i].offset < offset
00185                             && !(s->sums[i].flags & SUMFLG_SAME_OFFSET))
00186                                 continue;
00187 
00188                         if (verbose > 3) {
00189                                 rprintf(FINFO,
00190                                         "potential match at %.0f i=%ld sum=%08x\n",
00191                                         (double)offset, (long)i, sum);
00192                         }
00193 
00194                         if (!done_csum2) {
00195                                 map = (schar *)map_ptr(buf,offset,l);
00196                                 get_checksum2((char *)map,l,sum2);
00197                                 done_csum2 = 1;
00198                         }
00199 
00200                         if (memcmp(sum2,s->sums[i].sum2,s->s2length) != 0) {
00201                                 false_alarms++;
00202                                 continue;
00203                         }
00204 
00205                         /* When updating in-place, the best possible match is
00206                          * one with an identical offset, so we prefer that over
00207                          * the following want_i optimization. */
00208                         if (updating_basis_file) {
00209                                 int32 i2;
00210                                 for (i2 = i; i2 >= 0; i2 = s->sums[i2].chain) {
00211                                         if (s->sums[i2].offset != offset)
00212                                                 continue;
00213                                         if (i2 != i) {
00214                                                 if (sum != s->sums[i2].sum1)
00215                                                         break;
00216                                                 if (memcmp(sum2, s->sums[i2].sum2,
00217                                                            s->s2length) != 0)
00218                                                         break;
00219                                                 i = i2;
00220                                         }
00221                                         /* This chunk was at the same offset on
00222                                          * both the sender and the receiver. */
00223                                         s->sums[i].flags |= SUMFLG_SAME_OFFSET;
00224                                         goto set_want_i;
00225                                 }
00226                         }
00227 
00228                         /* we've found a match, but now check to see
00229                          * if want_i can hint at a better match. */
00230                         if (i != want_i && want_i < s->count
00231                             && (!updating_basis_file || s->sums[want_i].offset >= offset
00232                              || s->sums[want_i].flags & SUMFLG_SAME_OFFSET)
00233                             && sum == s->sums[want_i].sum1
00234                             && memcmp(sum2, s->sums[want_i].sum2, s->s2length) == 0) {
00235                                 /* we've found an adjacent match - the RLL coder
00236                                  * will be happy */
00237                                 i = want_i;
00238                         }
00239                     set_want_i:
00240                         want_i = i + 1;
00241 
00242                         matched(f,s,buf,offset,i);
00243                         offset += s->sums[i].len - 1;
00244                         k = (int32)MIN((OFF_T)s->blength, len-offset);
00245                         map = (schar *)map_ptr(buf, offset, k);
00246                         sum = get_checksum1((char *)map, k);
00247                         s1 = sum & 0xFFFF;
00248                         s2 = sum >> 16;
00249                         matches++;
00250                         break;
00251                 } while ((i = s->sums[i].chain) >= 0);
00252 
00253           null_hash:
00254                 backup = offset - last_match;
00255                 /* We sometimes read 1 byte prior to last_match... */
00256                 if (backup < 0)
00257                         backup = 0;
00258 
00259                 /* Trim off the first byte from the checksum */
00260                 more = offset + k < len;
00261                 map = (schar *)map_ptr(buf, offset - backup, k + more + backup)
00262                     + backup;
00263                 s1 -= map[0] + CHAR_OFFSET;
00264                 s2 -= k * (map[0]+CHAR_OFFSET);
00265 
00266                 /* Add on the next byte (if there is one) to the checksum */
00267                 if (more) {
00268                         s1 += map[k] + CHAR_OFFSET;
00269                         s2 += s1;
00270                 } else
00271                         --k;
00272 
00273                 /* By matching early we avoid re-reading the
00274                    data 3 times in the case where a token
00275                    match comes a long way after last
00276                    match. The 3 reads are caused by the
00277                    running match, the checksum update and the
00278                    literal send. */
00279                 if (backup >= s->blength+CHUNK_SIZE && end-offset > CHUNK_SIZE)
00280                         matched(f, s, buf, offset - s->blength, -2);
00281         } while (++offset < end);
00282 
00283         matched(f, s, buf, len, -1);
00284         map_ptr(buf, len-1, 1);
00285 }
00286 
00287 
00288 /**
00289  * Scan through a origin file, looking for sections that match
00290  * checksums from the generator, and transmit either literal or token
00291  * data.
00292  *
00293  * Also calculates the MD4 checksum of the whole file, using the md
00294  * accumulator.  This is transmitted with the file as protection
00295  * against corruption on the wire.
00296  *
00297  * @param s Checksums received from the generator.  If <tt>s->count ==
00298  * 0</tt>, then there are actually no checksums for this file.
00299  *
00300  * @param len Length of the file to send.
00301  **/
00302 void match_sums(int f, struct sum_struct *s, struct map_struct *buf, OFF_T len)
00303 {
00304         char file_sum[MD4_SUM_LENGTH];
00305 
00306         last_match = 0;
00307         false_alarms = 0;
00308         hash_hits = 0;
00309         matches = 0;
00310         data_transfer = 0;
00311 
00312         sum_init(checksum_seed);
00313 
00314         if (append_mode) {
00315                 OFF_T j = 0;
00316                 for (j = CHUNK_SIZE; j < s->flength; j += CHUNK_SIZE) {
00317                         if (buf && do_progress)
00318                                 show_progress(last_match, buf->file_size);
00319                         sum_update(map_ptr(buf, last_match, CHUNK_SIZE),
00320                                    CHUNK_SIZE);
00321                         last_match = j;
00322                 }
00323                 if (last_match < s->flength) {
00324                         int32 len = s->flength - last_match;
00325                         if (buf && do_progress)
00326                                 show_progress(last_match, buf->file_size);
00327                         sum_update(map_ptr(buf, last_match, len), len);
00328                         last_match = s->flength;
00329                 }
00330                 s->count = 0;
00331         }
00332 
00333         if (len > 0 && s->count > 0) {
00334                 build_hash_table(s);
00335 
00336                 if (verbose > 2)
00337                         rprintf(FINFO,"built hash table\n");
00338 
00339                 hash_search(f,s,buf,len);
00340 
00341                 if (verbose > 2)
00342                         rprintf(FINFO,"done hash search\n");
00343         } else {
00344                 OFF_T j;
00345                 /* by doing this in pieces we avoid too many seeks */
00346                 for (j = last_match + CHUNK_SIZE; j < len; j += CHUNK_SIZE)
00347                         matched(f, s, buf, j, -2);
00348                 matched(f, s, buf, len, -1);
00349         }
00350 
00351         sum_end(file_sum);
00352         /* If we had a read error, send a bad checksum. */
00353         if (buf && buf->status != 0)
00354                 file_sum[0]++;
00355 
00356         if (verbose > 2)
00357                 rprintf(FINFO,"sending file_sum\n");
00358         write_buf(f,file_sum,MD4_SUM_LENGTH);
00359 
00360         if (verbose > 2)
00361                 rprintf(FINFO, "false_alarms=%d hash_hits=%d matches=%d\n",
00362                         false_alarms, hash_hits, matches);
00363 
00364         total_hash_hits += hash_hits;
00365         total_false_alarms += false_alarms;
00366         total_matches += matches;
00367         stats.literal_data += data_transfer;
00368 }
00369 
00370 void match_report(void)
00371 {
00372         if (verbose <= 1)
00373                 return;
00374 
00375         rprintf(FINFO,
00376                 "total: matches=%d  hash_hits=%d  false_alarms=%d data=%.0f\n",
00377                 total_matches, total_hash_hits, total_false_alarms,
00378                 (double)stats.literal_data);
00379 }

rsyncに対してSat Dec 5 19:45:41 2009に生成されました。  doxygen 1.4.7