Main Page | Modules | Data Structures | File List | Data Fields | Globals | Related Pages

file/src/ascmagic.c

Go to the documentation of this file.
00001 /*
00002  * ASCII magic -- file types that we know based on keywords
00003  * that can appear anywhere in the file.
00004  *
00005  * Copyright (c) Ian F. Darwin, 1987.
00006  * Written by Ian F. Darwin.
00007  *
00008  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
00009  * to handle character codes other than ASCII on a unified basis.
00010  *
00011  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
00012  * international characters, now subsumed into this file.
00013  */
00014 
00015 /*
00016  * This software is not subject to any license of the American Telephone
00017  * and Telegraph Company or of the Regents of the University of California.
00018  *
00019  * Permission is granted to anyone to use this software for any purpose on
00020  * any computer system, and to alter it and redistribute it freely, subject
00021  * to the following restrictions:
00022  *
00023  * 1. The author is not responsible for the consequences of use of this
00024  *    software, no matter how awful, even if they arise from flaws in it.
00025  *
00026  * 2. The origin of this software must not be misrepresented, either by
00027  *    explicit claim or by omission.  Since few users ever read sources,
00028  *    credits must appear in the documentation.
00029  *
00030  * 3. Altered versions must be plainly marked as such, and must not be
00031  *    misrepresented as being the original software.  Since few users
00032  *    ever read sources, credits must appear in the documentation.
00033  *
00034  * 4. This notice may not be removed or altered.
00035  */
00036 
00037 #include "system.h"
00038 #include "file.h"
00039 #include "names.h"
00040 #include "tar.h"
00041 #include "debug.h"
00042 
00043 FILE_RCSID("@(#)Id: ascmagic.c,v 1.32 2002/07/03 18:26:37 christos Exp ")
00044 
00045 /*@access fmagic @*/
00046 
00047 /*
00048  * Stolen (by the author!) from the public domain tar program:
00049  * Public Domain version written 26 Aug 1985 John Gilmore (ihnp4!hoptoad!gnu).
00050  */
#define isodigit(c)     ( ((c) >= '0') && ((c) <= '7') )

/*
 * Quick and dirty octal conversion.
 *
 * Parses an octal number from a fixed-width field of at most `digs`
 * characters starting at `where`.  Leading whitespace is skipped (and
 * counts against the field width); parsing stops at the first non-octal
 * character or when the field width is exhausted.
 *
 * Result is -1 if the field is invalid: all blank, containing no octal
 * digit at all, or ending (inside the field) on a character that is
 * neither whitespace nor NUL.
 */
/*@-bounds@*/
static int
from_oct(int digs, const char *where)
        /*@*/
{
        int     value = 0;
        int     ndigits = 0;    /* octal digits actually consumed */

        while (isspace((unsigned char)*where)) {        /* Skip spaces */
                where++;
                if (--digs <= 0)
                        return -1;              /* All blank field */
        }
/*@-shiftimplementation@*/
        while (digs > 0 && isodigit(*where)) {  /* Scan til nonoctal */
                value = (value << 3) | (*where++ - '0');
                --digs;
                ++ndigits;
        }
/*@=shiftimplementation@*/

        /* A field holding no digits is not a number; don't report it as 0. */
        if (ndigits == 0)
                return -1;

        if (digs > 0 && *where && !isspace((unsigned char)*where))
                return -1;                      /* Ended on non-space/nul */

        return value;
}
/*@=bounds@*/
00084 
00085 /*
00086  * Return 
00087  *      0 if the checksum is bad (i.e., probably not a tar archive), 
00088  *      1 for old UNIX tar file,
00089  *      2 for Unix Std (POSIX) tar file.
00090  */
00091 static int
00092 is_tar(const fmagic fm)
00093         /*@*/
00094 {
00095         int nb = fm->nb;
00096         union record *header = (union record *)fm->buf;
00097         int     i;
00098         int     sum, recsum;
00099         char    *p;
00100 
00101         if (nb < sizeof(*header))
00102                 return 0;
00103 
00104         recsum = from_oct(8,  header->header.chksum);
00105 
00106         sum = 0;
00107         p = header->charptr;
00108 /*@-sizeoftype@*/
00109         for (i = sizeof(union record); --i >= 0;)
00110 /*@=sizeoftype@*/
00111         {
00112                 /*
00113                  * We can't use unsigned char here because of old compilers,
00114                  * e.g. V7.
00115                  */
00116                 sum += 0xFF & *p++;
00117         }
00118 
00119         /* Adjust checksum to count the "chksum" field as blanks. */
00120         for (i = sizeof(header->header.chksum); --i >= 0;)
00121                 sum -= 0xFF & header->header.chksum[i];
00122         sum += ' ' * sizeof header->header.chksum;      
00123 
00124         if (sum != recsum)
00125                 return 0;       /* Not a tar archive */
00126         
00127         if (!strcmp(header->header.magic, TARMAGIC)) 
00128                 return 2;               /* Unix Standard tar archive */
00129 
00130         return 1;                       /* Old fashioned tar archive */
00131 }
00132 typedef unsigned long unichar;
00133 
00134 #define MAXLINELEN 300  /* longest sane line length */
00135 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
00136                   || (x) == 0x85 || (x) == '\f')
00137 
00138 /*
00139  * This table reflects a particular philosophy about what constitutes
00140  * "text," and there is room for disagreement about it.
00141  *
00142  * Version 3.31 of the file command considered a file to be ASCII if
00143  * each of its characters was approved by either the isascii() or
00144  * isalpha() function.  On most systems, this would mean that any
00145  * file consisting only of characters in the range 0x00 ... 0x7F
00146  * would be called ASCII text, but many systems might reasonably
00147  * consider some characters outside this range to be alphabetic,
00148  * so the file command would call such characters ASCII.  It might
00149  * have been more accurate to call this "considered textual on the
00150  * local system" than "ASCII."
00151  *
00152  * It considered a file to be "International language text" if each
00153  * of its characters was either an ASCII printing character (according
00154  * to the real ASCII standard, not the above test), a character in
00155  * the range 0x80 ... 0xFF, or one of the following control characters:
00156  * backspace, tab, line feed, vertical tab, form feed, carriage return,
00157  * escape.  No attempt was made to determine the language in which files
00158  * of this type were written.
00159  *
00160  *
00161  * The table below considers a file to be ASCII if all of its characters
00162  * are either ASCII printing characters (again, according to the X3.4
00163  * standard, not isascii()) or any of the following controls: bell,
00164  * backspace, tab, line feed, form feed, carriage return, esc, nextline.
00165  *
00166  * I include bell because some programs (particularly shell scripts)
00167  * use it literally, even though it is rare in normal text.  I exclude
00168  * vertical tab because it never seems to be used in real text.  I also
00169  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
00170  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
00171  * character to.  It might be more appropriate to include it in the 8859
00172  * set instead of the ASCII set, but it's got to be included in *something*
00173  * we recognize or EBCDIC files aren't going to be considered textual.
00174  * Some old Unix source files use SO/SI (^N/^O) to shift between Greek
00175  * and Latin characters, so these should possibly be allowed.  But they
00176  * make a real mess on VT100-style displays if they're not paired properly,
00177  * so we are probably better off not calling them text.
00178  *
00179  * A file is considered to be ISO-8859 text if its characters are all
00180  * either ASCII, according to the above definition, or printing characters
00181  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
00182  *
00183  * Finally, a file is considered to be international text from some other
00184  * character code if its characters are all either ISO-8859 (according to
00185  * the above definition) or characters in the range 0x80 ... 0x9F, which
00186  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
00187  * consider to be printing characters.
00188  */
00189 
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * The table is read-only, so it is declared const.
 */
/*@unchecked@*/ /*@observer@*/
static const char text_chars[256] = {
        /*                  BEL BS HT LF    FF CR    */
        F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
        /*                              ESC          */
        F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
        T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
        /*            NEL                            */
        X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
        X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
        I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
00217 
00218 /*@-bounds@*/
00219 static int
00220 looks_ascii(const unsigned char *buf, int nb,
00221                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00222         /*@modifies *ubuf, *ulen @*/
00223 {
00224         int i;
00225 
00226         *ulen = 0;
00227 
00228         for (i = 0; i < nb; i++) {
00229                 int t = text_chars[buf[i]];
00230 
00231                 if (t != T)
00232                         return 0;
00233 
00234                 ubuf[(*ulen)++] = buf[i];
00235         }
00236 
00237         return 1;
00238 }
00239 /*@=bounds@*/
00240 
00241 /*@-bounds@*/
00242 static int
00243 looks_latin1(const unsigned char *buf, int nb,
00244                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00245         /*@modifies *ubuf, *ulen @*/
00246 {
00247         int i;
00248 
00249         *ulen = 0;
00250 
00251         for (i = 0; i < nb; i++) {
00252                 int t = text_chars[buf[i]];
00253 
00254                 if (t != T && t != I)
00255                         return 0;
00256 
00257                 ubuf[(*ulen)++] = buf[i];
00258         }
00259 
00260         return 1;
00261 }
00262 /*@=bounds@*/
00263 
00264 /*@-bounds@*/
00265 static int
00266 looks_extended(const unsigned char *buf, int nb,
00267                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00268         /*@modifies *ubuf, *ulen @*/
00269 {
00270         int i;
00271 
00272         *ulen = 0;
00273 
00274         for (i = 0; i < nb; i++) {
00275                 int t = text_chars[buf[i]];
00276 
00277                 if (t != T && t != I && t != X)
00278                         return 0;
00279 
00280                 ubuf[(*ulen)++] = buf[i];
00281         }
00282 
00283         return 1;
00284 }
00285 /*@=bounds@*/
00286 
00287 /*@-bounds@*/
00288 static int
00289 looks_utf8(const unsigned char *buf, int nb,
00290                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00291         /*@modifies *ubuf, *ulen @*/
00292 {
00293         int i, n;
00294         unichar c;
00295         int gotone = 0;
00296 
00297         *ulen = 0;
00298 
00299         for (i = 0; i < nb; i++) {
00300                 if ((buf[i] & 0x80) == 0) {        /* 0xxxxxxx is plain ASCII */
00301                         /*
00302                          * Even if the whole file is valid UTF-8 sequences,
00303                          * still reject it if it uses weird control characters.
00304                          */
00305 
00306                         if (text_chars[buf[i]] != T)
00307                                 return 0;
00308 
00309                         ubuf[(*ulen)++] = buf[i];
00310                 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
00311                         return 0;
00312                 } else {                           /* 11xxxxxx begins UTF-8 */
00313                         int following;
00314 
00315                         if ((buf[i] & 0x20) == 0) {             /* 110xxxxx */
00316                                 c = buf[i] & 0x1f;
00317                                 following = 1;
00318                         } else if ((buf[i] & 0x10) == 0) {      /* 1110xxxx */
00319                                 c = buf[i] & 0x0f;
00320                                 following = 2;
00321                         } else if ((buf[i] & 0x08) == 0) {      /* 11110xxx */
00322                                 c = buf[i] & 0x07;
00323                                 following = 3;
00324                         } else if ((buf[i] & 0x04) == 0) {      /* 111110xx */
00325                                 c = buf[i] & 0x03;
00326                                 following = 4;
00327                         } else if ((buf[i] & 0x02) == 0) {      /* 1111110x */
00328                                 c = buf[i] & 0x01;
00329                                 following = 5;
00330                         } else
00331                                 return 0;
00332 
00333                         for (n = 0; n < following; n++) {
00334                                 i++;
00335                                 if (i >= nb)
00336                                         goto done;
00337 
00338                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00339                                         return 0;
00340 
00341                                 c = (c << 6) + (buf[i] & 0x3f);
00342                         }
00343 
00344                         ubuf[(*ulen)++] = c;
00345                         gotone = 1;
00346                 }
00347         }
00348 done:
00349         return gotone;   /* don't claim it's UTF-8 if it's all 7-bit */
00350 }
00351 /*@=bounds@*/
00352 
00353 /*@-bounds@*/
00354 static int
00355 looks_unicode(const unsigned char *buf, int nb,
00356                 /*@out@*/ unichar *ubuf, /*@out@*/ int *ulen)
00357         /*@modifies *ubuf, *ulen @*/
00358 {
00359         int bigend;
00360         int i;
00361 
00362         if (nb < 2)
00363                 return 0;
00364 
00365         if (buf[0] == 0xff && buf[1] == 0xfe)
00366                 bigend = 0;
00367         else if (buf[0] == 0xfe && buf[1] == 0xff)
00368                 bigend = 1;
00369         else
00370                 return 0;
00371 
00372         *ulen = 0;
00373 
00374         for (i = 2; i + 1 < nb; i += 2) {
00375                 /* XXX fix to properly handle chars > 65536 */
00376 
00377                 if (bigend)
00378                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00379                 else
00380                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00381 
00382                 if (ubuf[*ulen - 1] == 0xfffe)
00383                         return 0;
00384                 if (ubuf[*ulen - 1] < 128 && text_chars[ubuf[*ulen - 1]] != T)
00385                         return 0;
00386         }
00387 
00388         return 1;
00389 }
00390 /*@=bounds@*/
00391 
00392 #undef F
00393 #undef T
00394 #undef I
00395 #undef X
00396 
00397 /*
00398  * This table maps each EBCDIC character to an (8-bit extended) ASCII
00399  * character, as specified in the rationale for the dd(1) command in
00400  * draft 11.2 (September, 1991) of the POSIX P1003.2 standard.
00401  *
00402  * Unfortunately it does not seem to correspond exactly to any of the
00403  * five variants of EBCDIC documented in IBM's _Enterprise Systems
00404  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
00405  * Edition, July, 1999, pp. I-1 - I-4.
00406  *
00407  * Fortunately, though, all versions of EBCDIC, including this one, agree
00408  * on most of the printing characters that also appear in (7-bit) ASCII.
00409  * Of these, only '|', '!', '~', '^', '[', and ']' are in question at all.
00410  *
00411  * Fortunately too, there is general agreement that codes 0x00 through
00412  * 0x3F represent control characters, 0x41 a nonbreaking space, and the
00413  * remainder printing characters.
00414  *
00415  * This is sufficient to allow us to identify EBCDIC text and to distinguish
00416  * between old-style and internationalized examples of text.
00417  */
00418 
/*@unchecked@*/ /*@observer@*/
static const unsigned char ebcdic_to_ascii[] = {
  0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
 16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
'&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
'-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
'{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
'}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
'\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
};

/*
 * The following EBCDIC-to-ASCII table may relate more closely to reality,
 * or at least to modern reality.  It comes from
 *
 *   http://ftp.s390.ibm.com/products/oe/bpxqp9.html
 *
 * and maps the characters of EBCDIC code page 1047 (the code used for
 * Unix-derived software on IBM's 390 systems) to the corresponding
 * characters from ISO 8859-1.
 *
 * If this table is used instead of the above one, some of the special
 * cases for the NEL character can be taken out of the code.
 */

#ifdef  UNUSED
/*@unchecked@*/ /*@unused@*/ /*@observer@*/
static const unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif

/*
 * Copy buf[0 ... nb-1] into otp[], translating each byte from EBCDIC to
 * (8-bit extended) ASCII through ebcdic_to_ascii.  The caller provides
 * room for nb bytes in otp; nothing is written when nb <= 0.
 */
/*@-bounds@*/
static void
from_ebcdic(const unsigned char *buf, int nb, /*@out@*/ unsigned char *otp)
        /*@modifies *otp @*/
{
        int k = 0;

        while (k < nb) {
                otp[k] = ebcdic_to_ascii[buf[k]];
                k++;
        }
}
/*@=bounds@*/
00490 
00491 /*@-bounds@*/
00492 static int
00493 fmagicAMatch(const unsigned char *s, const unichar *us, int ulen)
00494         /*@*/
00495 {
00496         size_t i;
00497 
00498         for (i = 0; i < ulen; i++) {
00499                 if (s[i] != us[i])
00500                         return 0;
00501         }
00502 
00503         if (s[i])
00504                 return 0;
00505         else
00506                 return 1;
00507 }
00508 /*@=bounds@*/
00509 
00510 /* int nb: size actually read */
00511 /*@-bounds@*/
00512 int
00513 fmagicA(fmagic fm)
00514 {
00515         unsigned char * buf = fm->buf;
00516         int nb = fm->nb;
00517 
00518         char nbuf[HOWMANY+1];           /* one extra for terminating '\0' */
00519         unichar ubuf[HOWMANY+1];        /* one extra for terminating '\0' */
00520         int ulen;
00521         struct names *p;
00522         int i;
00523 
00524         const char *code = NULL;
00525         const char *code_mime = NULL;
00526         const char *type = NULL;
00527         const char *subtype = NULL;
00528         const char *subtype_mime = NULL;
00529 
00530         int has_escapes = 0;
00531         int has_backspace = 0;
00532 
00533         int n_crlf = 0;
00534         int n_lf = 0;
00535         int n_cr = 0;
00536         int n_nel = 0;
00537 
00538         int last_line_end = -1;
00539         int has_long_lines = 0;
00540 
00541         /*
00542          * Do the tar test first, because if the first file in the tar
00543          * archive starts with a dot, we can confuse it with an nroff file.
00544          */
00545         switch (is_tar(fm)) {
00546         case 1:
00547                 file_printf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00548                         ? "application/x-tar" : "tar archive"));
00549                 return 1;
00550         case 2:
00551                 file_printf(fm, ((fm->flags & FMAGIC_FLAGS_MIME)
00552                         ? "application/x-tar, POSIX" : "POSIX tar archive"));
00553                 return 1;
00554         }
00555 
00556         /*
00557          * Undo the NUL-termination kindly provided by fmagicProcess()
00558          * but leave at least one byte to look at
00559          */
00560 
00561         while (nb > 1 && buf[nb - 1] == '\0')
00562                 nb--;
00563 
00564         /*
00565          * Then try to determine whether it's any character code we can
00566          * identify.  Each of these tests, if it succeeds, will leave
00567          * the text converted into one-unichar-per-character Unicode in
00568          * ubuf, and the number of characters converted in ulen.
00569          */
00570         if (looks_ascii(buf, nb, ubuf, &ulen)) {
00571                 code = "ASCII";
00572                 code_mime = "us-ascii";
00573                 type = "text";
00574         } else if (looks_utf8(buf, nb, ubuf, &ulen)) {
00575                 code = "UTF-8 Unicode";
00576                 code_mime = "utf-8";
00577                 type = "text";
00578         } else if ((i = looks_unicode(buf, nb, ubuf, &ulen))) {
00579                 if (i == 1)
00580                         code = "Little-endian UTF-16 Unicode";
00581                 else
00582                         code = "Big-endian UTF-16 Unicode";
00583 
00584                 type = "character data";
00585                 code_mime = "utf-16";    /* is this defined? */
00586         } else if (looks_latin1(buf, nb, ubuf, &ulen)) {
00587                 code = "ISO-8859";
00588                 type = "text";
00589                 code_mime = "iso-8859-1"; 
00590         } else if (looks_extended(buf, nb, ubuf, &ulen)) {
00591                 code = "Non-ISO extended-ASCII";
00592                 type = "text";
00593                 code_mime = "unknown";
00594         } else {
00595                 from_ebcdic(buf, nb, nbuf);
00596 
00597                 if (looks_ascii(nbuf, nb, ubuf, &ulen)) {
00598                         code = "EBCDIC";
00599                         type = "character data";
00600                         code_mime = "ebcdic";
00601                 } else if (looks_latin1(nbuf, nb, ubuf, &ulen)) {
00602                         code = "International EBCDIC";
00603                         type = "character data";
00604                         code_mime = "ebcdic";
00605                 } else {
00606                         return 0;  /* doesn't look like text at all */
00607                 }
00608         }
00609 
00610         /*
00611          * for troff, look for . + letter + letter or .\";
00612          * this must be done to disambiguate tar archives' ./file
00613          * and other trash from real troff input.
00614          *
00615          * I believe Plan 9 troff allows non-ASCII characters in the names
00616          * of macros, so this test might possibly fail on such a file.
00617          */
00618         if (*ubuf == '.') {
00619                 unichar *tp = ubuf + 1;
00620 
00621                 while (ISSPC(*tp))
00622                         ++tp;   /* skip leading whitespace */
00623                 if ((tp[0] == '\\' && tp[1] == '\"') ||
00624                     (isascii(tp[0]) && isalnum(tp[0]) &&
00625                      isascii(tp[1]) && isalnum(tp[1]) &&
00626                      ISSPC(tp[2]))) {
00627                         subtype_mime = "text/troff";
00628                         subtype = "troff or preprocessor input";
00629                         goto subtype_identified;
00630                 }
00631         }
00632 
00633         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00634                 subtype_mime = "text/fortran";
00635                 subtype = "fortran program";
00636                 goto subtype_identified;
00637         }
00638 
00639         /* look for tokens from names.h - this is expensive! */
00640 
00641         i = 0;
00642         while (i < ulen) {
00643                 int end;
00644 
00645                 /*
00646                  * skip past any leading space
00647                  */
00648                 while (i < ulen && ISSPC(ubuf[i]))
00649                         i++;
00650                 if (i >= ulen)
00651                         break;
00652 
00653                 /*
00654                  * find the next whitespace
00655                  */
00656                 for (end = i + 1; end < nb; end++)
00657                         if (ISSPC(ubuf[end]))
00658                                 /*@innerbreak@*/ break;
00659 
00660                 /*
00661                  * compare the word thus isolated against the token list
00662                  */
00663 /*@-sizeoftype@*/
00664                 for (p = names; p < names + NNAMES; p++)
00665 /*@=sizeoftype@*/
00666                 {
00667                         if (p->name == NULL)
00668                                 /*@innerbreak@*/ break;
00669                         if (fmagicAMatch(p->name, ubuf + i, end - i)) {
00670                                 subtype = types[p->type].human;
00671                                 subtype_mime = types[p->type].mime;
00672                                 goto subtype_identified;
00673                         }
00674                 }
00675 
00676                 i = end;
00677         }
00678 
00679 subtype_identified:
00680 
00681         /*
00682          * Now try to discover other details about the file.
00683          */
00684         for (i = 0; i < ulen; i++) {
00685                 if (i > last_line_end + MAXLINELEN)
00686                         has_long_lines = 1;
00687 
00688                 if (ubuf[i] == '\033')
00689                         has_escapes = 1;
00690                 if (ubuf[i] == '\b')
00691                         has_backspace = 1;
00692 
00693                 if (ubuf[i] == '\r' && (i + 1 <  ulen && ubuf[i + 1] == '\n')) {
00694                         n_crlf++;
00695                         last_line_end = i;
00696                 }
00697                 if (ubuf[i] == '\r' && (i + 1 >= ulen || ubuf[i + 1] != '\n')) {
00698                         n_cr++;
00699                         last_line_end = i;
00700                 }
00701                 if (ubuf[i] == '\n' && (i - 1 <  0    || ubuf[i - 1] != '\r')) {
00702                         n_lf++;
00703                         last_line_end = i;
00704                 }
00705                 if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
00706                         n_nel++;
00707                         last_line_end = i;
00708                 }
00709         }
00710 
00711         if ((fm->flags & FMAGIC_FLAGS_MIME)) {
00712                 if (subtype_mime != NULL)
00713                         file_printf(fm, subtype_mime);
00714                 else
00715                         file_printf(fm, "text/plain");
00716 
00717                 if (code_mime != NULL) {
00718                         file_printf(fm, "; charset=");
00719                         file_printf(fm, code_mime);
00720                 }
00721         } else {
00722                 file_printf(fm, code);
00723 
00724                 if (subtype != NULL) {
00725                         file_printf(fm, " ");
00726                         file_printf(fm, subtype);
00727                 }
00728                 file_printf(fm, " ");
00729                 file_printf(fm, type);
00730 
00731                 if (has_long_lines)
00732                         file_printf(fm, ", with very long lines");
00733 
00734                 /*
00735                  * Only report line terminators if we find one other than LF,
00736                  * or if we find none at all.
00737                  */
00738                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00739                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00740                         file_printf(fm, ", with");
00741 
00742                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)
00743                                 file_printf(fm, " no");
00744                         else {
00745                                 if (n_crlf) {
00746                                         file_printf(fm, " CRLF");
00747                                         if (n_cr || n_lf || n_nel)
00748                                                 file_printf(fm, ",");
00749                                 }
00750                                 if (n_cr) {
00751                                         file_printf(fm, " CR");
00752                                         if (n_lf || n_nel)
00753                                                 file_printf(fm, ",");
00754                                 }
00755                                 if (n_lf) {
00756                                         file_printf(fm, " LF");
00757                                         if (n_nel)
00758                                                 file_printf(fm, ",");
00759                                 }
00760                                 if (n_nel)
00761                                         file_printf(fm, " NEL");
00762                         }
00763 
00764                         file_printf(fm, " line terminators");
00765                 }
00766 
00767                 if (has_escapes)
00768                         file_printf(fm, ", with escape sequences");
00769                 if (has_backspace)
00770                         file_printf(fm, ", with overstriking");
00771         }
00772 
00773         return 1;
00774 }
00775 /*@=bounds@*/

Generated on Tue Aug 23 16:56:36 2005 for rpm by doxygen 1.3.5