00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036 
00037 
00038 
00039 #include "file.h"
00040 #include "magic.h"
00041 #include <stdio.h>
00042 #include <string.h>
00043 #include <memory.h>
00044 #include <ctype.h>
00045 #include <stdlib.h>
00046 #ifdef HAVE_UNISTD_H
00047 #include <unistd.h>
00048 #endif
00049 #include "names.h"
00050 
00051 #ifndef lint
00052 FILE_RCSID("@(#)$Id: ascmagic.c,v 1.43 2005/06/25 15:52:14 christos Exp $")
00053 #endif  
00054 
00055 typedef unsigned long unichar;
00056 
00057 #define MAXLINELEN 300  
00058 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
00059                   || (x) == 0x85 || (x) == '\f')
00060 
00061 private int looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00062         ;
00063 private int looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00064         ;
00065 private int looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00066         ;
00067 private int looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00068         ;
00069 private int looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00070         ;
00071 private void from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00072         ;
00073 private int ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00074         ;
00075 
00076 
00077 protected int
00078 file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
00079 {
00080         size_t i;
00081         unsigned char *nbuf = NULL;
00082         unichar *ubuf = NULL;   
00083         size_t ulen;
00084         struct names *p;
00085         int rv = -1;
00086 
00087         const char *code = NULL;
00088         const char *code_mime = NULL;
00089         const char *type = NULL;
00090         const char *subtype = NULL;
00091         const char *subtype_mime = NULL;
00092 
00093         int has_escapes = 0;
00094         int has_backspace = 0;
00095         int seen_cr = 0;
00096 
00097         int n_crlf = 0;
00098         int n_lf = 0;
00099         int n_cr = 0;
00100         int n_nel = 0;
00101 
00102         int last_line_end = -1;
00103         int has_long_lines = 0;
00104 
00105         
00106 
00107 
00108 
00109         while (nbytes > 1 && buf[nbytes - 1] == '\0')
00110                 nbytes--;
00111 
00112         if ((nbuf = malloc((nbytes + 1) * sizeof(nbuf[0]))) == NULL)
00113                 goto done;
00114         if ((ubuf = malloc((nbytes + 1) * sizeof(ubuf[0]))) == NULL)
00115                 goto done;
00116 
00117         
00118 
00119 
00120 
00121 
00122 
00123         if (looks_ascii(buf, nbytes, ubuf, &ulen)) {
00124                 code = "ASCII";
00125                 code_mime = "us-ascii";
00126                 type = "text";
00127         } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
00128                 code = "UTF-8 Unicode";
00129                 code_mime = "utf-8";
00130                 type = "text";
00131         } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
00132                 if (i == 1)
00133                         code = "Little-endian UTF-16 Unicode";
00134                 else
00135                         code = "Big-endian UTF-16 Unicode";
00136 
00137                 type = "character data";
00138                 code_mime = "utf-16";    
00139         } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) {
00140                 code = "ISO-8859";
00141                 type = "text";
00142                 code_mime = "iso-8859-1"; 
00143         } else if (looks_extended(buf, nbytes, ubuf, &ulen)) {
00144                 code = "Non-ISO extended-ASCII";
00145                 type = "text";
00146                 code_mime = "unknown";
00147         } else {
00148                 from_ebcdic(buf, nbytes, nbuf);
00149 
00150                 if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) {
00151                         code = "EBCDIC";
00152                         type = "character data";
00153                         code_mime = "ebcdic";
00154                 } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) {
00155                         code = "International EBCDIC";
00156                         type = "character data";
00157                         code_mime = "ebcdic";
00158                 } else {
00159                         rv = 0;
00160                         goto done;  
00161                 }
00162         }
00163 
00164         
00165 
00166 
00167 
00168 
00169 
00170 
00171 
00172         if (*ubuf == '.') {
00173                 unichar *tp = ubuf + 1;
00174 
00175                 while (ISSPC(*tp))
00176                         ++tp;   
00177                 if ((tp[0] == '\\' && tp[1] == '\"') ||
00178                     (isascii((unsigned char)tp[0]) &&
00179                      isalnum((unsigned char)tp[0]) &&
00180                      isascii((unsigned char)tp[1]) &&
00181                      isalnum((unsigned char)tp[1]) &&
00182                      ISSPC(tp[2]))) {
00183                         subtype_mime = "text/troff";
00184                         subtype = "troff or preprocessor input";
00185                         goto subtype_identified;
00186                 }
00187         }
00188 
00189         if ((*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
00190                 subtype_mime = "text/fortran";
00191                 subtype = "fortran program";
00192                 goto subtype_identified;
00193         }
00194 
00195         
00196 
00197         i = 0;
00198         while (i < ulen) {
00199                 size_t end;
00200 
00201                 
00202 
00203 
00204                 while (i < ulen && ISSPC(ubuf[i]))
00205                         i++;
00206                 if (i >= ulen)
00207                         break;
00208 
00209                 
00210 
00211 
00212                 for (end = i + 1; end < nbytes; end++)
00213                         if (ISSPC(ubuf[end]))
00214                                  break;
00215 
00216                 
00217 
00218 
00219                 for (p = names; p < names + NNAMES; p++) {
00220                         if (ascmatch((const unsigned char *)p->name, ubuf + i,
00221                             end - i)) {
00222                                 subtype = types[p->type].human;
00223                                 subtype_mime = types[p->type].mime;
00224                                 goto subtype_identified;
00225                         }
00226                 }
00227 
00228                 i = end;
00229         }
00230 
00231 subtype_identified:
00232 
00233         
00234 
00235 
00236         for (i = 0; i < ulen; i++) {
00237                 if (ubuf[i] == '\n') {
00238                         if (seen_cr)
00239                                 n_crlf++;
00240                         else
00241                                 n_lf++;
00242                         last_line_end = i;
00243                 } else if (seen_cr)
00244                         n_cr++;
00245 
00246                 seen_cr = (ubuf[i] == '\r');
00247                 if (seen_cr)
00248                         last_line_end = i;
00249 
00250                 if (ubuf[i] == 0x85) { 
00251                         n_nel++;
00252                         last_line_end = i;
00253                 }
00254 
00255                 
00256                 if (i > last_line_end + MAXLINELEN)
00257                         has_long_lines = 1;
00258 
00259                 if (ubuf[i] == '\033')
00260                         has_escapes = 1;
00261                 if (ubuf[i] == '\b')
00262                         has_backspace = 1;
00263         }
00264 
00265         
00266 
00267 
00268 
00269         if (seen_cr && nbytes < HOWMANY)
00270                 n_cr++;
00271 
00272         if ((ms->flags & MAGIC_MIME)) {
00273                 if (subtype_mime) {
00274                         if (file_printf(ms, subtype_mime) == -1)
00275                                 goto done;
00276                 } else {
00277                         if (file_printf(ms, "text/plain") == -1)
00278                                 goto done;
00279                 }
00280 
00281                 if (code_mime) {
00282                         if (file_printf(ms, "; charset=") == -1)
00283                                 goto done;
00284                         if (file_printf(ms, code_mime) == -1)
00285                                 goto done;
00286                 }
00287         } else {
00288                 if (file_printf(ms, code) == -1)
00289                         goto done;
00290 
00291                 if (subtype) {
00292                         if (file_printf(ms, " ") == -1)
00293                                 goto done;
00294                         if (file_printf(ms, subtype) == -1)
00295                                 goto done;
00296                 }
00297 
00298                 if (file_printf(ms, " ") == -1)
00299                         goto done;
00300                 if (file_printf(ms, type) == -1)
00301                         goto done;
00302 
00303                 if (has_long_lines)
00304                         if (file_printf(ms, ", with very long lines") == -1)
00305                                 goto done;
00306 
00307                 
00308 
00309 
00310 
00311                 if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
00312                     (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
00313                         if (file_printf(ms, ", with") == -1)
00314                                 goto done;
00315 
00316                         if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0)                        {
00317                                 if (file_printf(ms, " no") == -1)
00318                                         goto done;
00319                         } else {
00320                                 if (n_crlf) {
00321                                         if (file_printf(ms, " CRLF") == -1)
00322                                                 goto done;
00323                                         if (n_cr || n_lf || n_nel)
00324                                                 if (file_printf(ms, ",") == -1)
00325                                                         goto done;
00326                                 }
00327                                 if (n_cr) {
00328                                         if (file_printf(ms, " CR") == -1)
00329                                                 goto done;
00330                                         if (n_lf || n_nel)
00331                                                 if (file_printf(ms, ",") == -1)
00332                                                         goto done;
00333                                 }
00334                                 if (n_lf) {
00335                                         if (file_printf(ms, " LF") == -1)
00336                                                 goto done;
00337                                         if (n_nel)
00338                                                 if (file_printf(ms, ",") == -1)
00339                                                         goto done;
00340                                 }
00341                                 if (n_nel)
00342                                         if (file_printf(ms, " NEL") == -1)
00343                                                 goto done;
00344                         }
00345 
00346                         if (file_printf(ms, " line terminators") == -1)
00347                                 goto done;
00348                 }
00349 
00350                 if (has_escapes)
00351                         if (file_printf(ms, ", with escape sequences") == -1)
00352                                 goto done;
00353                 if (has_backspace)
00354                         if (file_printf(ms, ", with overstriking") == -1)
00355                                 goto done;
00356         }
00357         rv = 1;
00358 done:
00359         if (nbuf)
00360                 free(nbuf);
00361         if (ubuf)
00362                 free(ubuf);
00363 
00364         return rv;
00365 }
00366 
00367 private int
00368 ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
00369 {
00370         size_t i;
00371 
00372         for (i = 0; i < ulen; i++) {
00373                 if (s[i] != us[i])
00374                         return 0;
00375         }
00376 
00377         if (s[i])
00378                 return 0;
00379         else
00380                 return 1;
00381 }
00382 
00383 
00384 
00385 
00386 
00387 
00388 
00389 
00390 
00391 
00392 
00393 
00394 
00395 
00396 
00397 
00398 
00399 
00400 
00401 
00402 
00403 
00404 
00405 
00406 
00407 
00408 
00409 
00410 
00411 
00412 
00413 
00414 
00415 
00416 
00417 
00418 
00419 
00420 
00421 
00422 
00423 
00424 
00425 
00426 
00427 
00428 
00429 
00430 
00431 
00432 
00433 
00434 
00435 #define F 0   
00436 #define T 1   
00437 #define I 2   
00438 #define X 3   
00439 
00440  
00441 private char text_chars[256] = {
00442         
00443         F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  
00444         
00445         F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  
00446         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  
00447         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  
00448         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  
00449         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  
00450         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  
00451         T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  
00452         
00453         X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  
00454         X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  
00455         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  
00456         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  
00457         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  
00458         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  
00459         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  
00460         I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   
00461 };
00462 
00463 private int
00464 looks_ascii(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00465     size_t *ulen)
00466 {
00467         int i;
00468 
00469         *ulen = 0;
00470 
00471         for (i = 0; i < nbytes; i++) {
00472                 int t = text_chars[buf[i]];
00473 
00474                 if (t != T)
00475                         return 0;
00476 
00477                 ubuf[(*ulen)++] = buf[i];
00478         }
00479 
00480         return 1;
00481 }
00482 
00483 private int
00484 looks_latin1(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00485 {
00486         int i;
00487 
00488         *ulen = 0;
00489 
00490         for (i = 0; i < nbytes; i++) {
00491                 int t = text_chars[buf[i]];
00492 
00493                 if (t != T && t != I)
00494                         return 0;
00495 
00496                 ubuf[(*ulen)++] = buf[i];
00497         }
00498 
00499         return 1;
00500 }
00501 
00502 private int
00503 looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00504     size_t *ulen)
00505 {
00506         int i;
00507 
00508         *ulen = 0;
00509 
00510         for (i = 0; i < nbytes; i++) {
00511                 int t = text_chars[buf[i]];
00512 
00513                 if (t != T && t != I && t != X)
00514                         return 0;
00515 
00516                 ubuf[(*ulen)++] = buf[i];
00517         }
00518 
00519         return 1;
00520 }
00521 
00522 private int
00523 looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
00524 {
00525         int i, n;
00526         unichar c;
00527         int gotone = 0;
00528 
00529         *ulen = 0;
00530 
00531         for (i = 0; i < nbytes; i++) {
00532                 if ((buf[i] & 0x80) == 0) {        
00533                         
00534 
00535 
00536 
00537 
00538                         if (text_chars[buf[i]] != T)
00539                                 return 0;
00540 
00541                         ubuf[(*ulen)++] = buf[i];
00542                 } else if ((buf[i] & 0x40) == 0) { 
00543                         return 0;
00544                 } else {                           
00545                         int following;
00546 
00547                         if ((buf[i] & 0x20) == 0) {             
00548                                 c = buf[i] & 0x1f;
00549                                 following = 1;
00550                         } else if ((buf[i] & 0x10) == 0) {      
00551                                 c = buf[i] & 0x0f;
00552                                 following = 2;
00553                         } else if ((buf[i] & 0x08) == 0) {      
00554                                 c = buf[i] & 0x07;
00555                                 following = 3;
00556                         } else if ((buf[i] & 0x04) == 0) {      
00557                                 c = buf[i] & 0x03;
00558                                 following = 4;
00559                         } else if ((buf[i] & 0x02) == 0) {      
00560                                 c = buf[i] & 0x01;
00561                                 following = 5;
00562                         } else
00563                                 return 0;
00564 
00565                         for (n = 0; n < following; n++) {
00566                                 i++;
00567                                 if (i >= nbytes)
00568                                         goto done;
00569 
00570                                 if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
00571                                         return 0;
00572 
00573                                 c = (c << 6) + (buf[i] & 0x3f);
00574                         }
00575 
00576                         ubuf[(*ulen)++] = c;
00577                         gotone = 1;
00578                 }
00579         }
00580 done:
00581         return gotone;   
00582 }
00583 
00584 private int
00585 looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
00586     size_t *ulen)
00587 {
00588         int bigend;
00589         int i;
00590 
00591         if (nbytes < 2)
00592                 return 0;
00593 
00594         if (buf[0] == 0xff && buf[1] == 0xfe)
00595                 bigend = 0;
00596         else if (buf[0] == 0xfe && buf[1] == 0xff)
00597                 bigend = 1;
00598         else
00599                 return 0;
00600 
00601         *ulen = 0;
00602 
00603         for (i = 2; i + 1 < nbytes; i += 2) {
00604                 
00605 
00606                 if (bigend)
00607                         ubuf[(*ulen)++] = buf[i + 1] + 256 * buf[i];
00608                 else
00609                         ubuf[(*ulen)++] = buf[i] + 256 * buf[i + 1];
00610 
00611                 if (ubuf[*ulen - 1] == 0xfffe)
00612                         return 0;
00613                 if (ubuf[*ulen - 1] < 128 &&
00614                     text_chars[(size_t)ubuf[*ulen - 1]] != T)
00615                         return 0;
00616         }
00617 
00618         return 1 + bigend;
00619 }
00620 
00621 #undef F
00622 #undef T
00623 #undef I
00624 #undef X
00625 
00626 
00627 
00628 
00629 
00630 
00631 
00632 
00633 
00634 
00635 
00636 
00637 
00638 
00639 
00640 
00641 
00642 
00643 
00644 
00645 
00646 
00647 
00648  
00649 private unsigned char ebcdic_to_ascii[] = {
00650   0,   1,   2,   3, 156,   9, 134, 127, 151, 141, 142,  11,  12,  13,  14,  15,
00651  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
00652 128, 129, 130, 131, 132,  10,  23,  27, 136, 137, 138, 139, 140,   5,   6,   7,
00653 144, 145,  22, 147, 148, 149, 150,   4, 152, 153, 154, 155,  20,  21, 158,  26,
00654 ' ', 160, 161, 162, 163, 164, 165, 166, 167, 168, 213, '.', '<', '(', '+', '|',
00655 '&', 169, 170, 171, 172, 173, 174, 175, 176, 177, '!', '$', '*', ')', ';', '~',
00656 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
00657 186, 187, 188, 189, 190, 191, 192, 193, 194, '`', ':', '#', '@', '\'','=', '"',
00658 195, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 196, 197, 198, 199, 200, 201,
00659 202, 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', '^', 204, 205, 206, 207, 208,
00660 209, 229, 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 210, 211, 212, '[', 214, 215,
00661 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, ']', 230, 231,
00662 '{', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 232, 233, 234, 235, 236, 237,
00663 '}', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 238, 239, 240, 241, 242, 243,
00664 '\\',159, 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 244, 245, 246, 247, 248, 249,
00665 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
00666 };
00667 
#ifdef notdef
/*
 * Alternative translation table, compiled only when notdef is defined
 * and not referenced by the code in this file.  Judging by its name it
 * maps EBCDIC code page 1047 to ISO 8859 - confirm before enabling.
 * Declared const like the live table, since it is pure lookup data.
 */
private const unsigned char ebcdic_1047_to_8859[] = {
0x00,0x01,0x02,0x03,0x9C,0x09,0x86,0x7F,0x97,0x8D,0x8E,0x0B,0x0C,0x0D,0x0E,0x0F,
0x10,0x11,0x12,0x13,0x9D,0x0A,0x08,0x87,0x18,0x19,0x92,0x8F,0x1C,0x1D,0x1E,0x1F,
0x80,0x81,0x82,0x83,0x84,0x85,0x17,0x1B,0x88,0x89,0x8A,0x8B,0x8C,0x05,0x06,0x07,
0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
0x20,0xA0,0xE2,0xE4,0xE0,0xE1,0xE3,0xE5,0xE7,0xF1,0xA2,0x2E,0x3C,0x28,0x2B,0x7C,
0x26,0xE9,0xEA,0xEB,0xE8,0xED,0xEE,0xEF,0xEC,0xDF,0x21,0x24,0x2A,0x29,0x3B,0x5E,
0x2D,0x2F,0xC2,0xC4,0xC0,0xC1,0xC3,0xC5,0xC7,0xD1,0xA6,0x2C,0x25,0x5F,0x3E,0x3F,
0xF8,0xC9,0xCA,0xCB,0xC8,0xCD,0xCE,0xCF,0xCC,0x60,0x3A,0x23,0x40,0x27,0x3D,0x22,
0xD8,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0xAB,0xBB,0xF0,0xFD,0xFE,0xB1,
0xB0,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,0x70,0x71,0x72,0xAA,0xBA,0xE6,0xB8,0xC6,0xA4,
0xB5,0x7E,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0xA1,0xBF,0xD0,0x5B,0xDE,0xAE,
0xAC,0xA3,0xA5,0xB7,0xA9,0xA7,0xB6,0xBC,0xBD,0xBE,0xDD,0xA8,0xAF,0x5D,0xB4,0xD7,
0x7B,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0xAD,0xF4,0xF6,0xF2,0xF3,0xF5,
0x7D,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,0x50,0x51,0x52,0xB9,0xFB,0xFC,0xF9,0xFA,0xFF,
0x5C,0xF7,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0xB2,0xD4,0xD6,0xD2,0xD3,0xD5,
0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0xB3,0xDB,0xDC,0xD9,0xDA,0x9F
};
#endif
00703 
00704 
00705 
00706 
00707 private void
00708 from_ebcdic(const unsigned char *buf, size_t nbytes, unsigned char *out)
00709 {
00710         int i;
00711 
00712         for (i = 0; i < nbytes; i++) {
00713                 out[i] = ebcdic_to_ascii[buf[i]];
00714         }
00715 }