/*
 * sep2stata.c  This program was written by Phil Goldberg in 1992, while
 *              working for the Children's Memorial Medical Center,
 *              Cardiopulmonary Bioengineering Laboratory, Chicago, IL.
 *
 *              Reach the author at:
 *                      (312) 880-4730
 *
 *              It is fairly straightforward, reading
 *              from a file and writing to stdout.  It reformats a file
 *              that has data separated by commas, vertical bars, backslashes,
 *              or tabs so that it is acceptable for reading by stata.
 *
 *      1.  If the first line begins with a quoted string,
 *          it assumes that the top line is a list
 *          of variable names for the columns.  If not, it assumes that
 *          the first line contains data.
 *
 *      2.  The program makes two passes over the data, the first pass
 *          lets the program know:
 *              A.  Which columns contain strings and
 *                  which columns contain numbers;
 *              B.  How wide the widest string in each string column is; and
 *              C.  Whether a column contains many strings (> 50%) or
 *                  not, so that non-string columns will be declared
 *                  properly.
 *
 *      3.  In the second phase, conversion actually takes place:
 *              A.  If a line of variable names existed at the top of the
 *                  file, we skip the first line and write out a data
 *                  dictionary based on those definitions for stata to use.
 *                  If there were no variable names or not enough for all
 *                  of the columns of data, we write "labeln" as the
 *                  dictionary entry for the nth variable (e.g., label74).
 *              B.  We read in each data line, converting whatever
 *                  separator there is to a tab.
 *              C.  NA -> .
 *              D.  "" -> .
 *
 *
 *  compilation:  (under unix)  cc sep2stata.c -o sep2stata
 *
 *                Defining SEP2STATA_NO_MAIN leaves out main() so that the
 *                helper routines can be compiled into a test driver.
 *
 *  use:          sep2stata input_file > output_file.dct
 *                where 'input_file' is a file containing values separated by
 *                commas, tabs, backslashes or vertical bars,
 *                and 'output_file'.dct is the name of a file for stata to use
 *                with the "infile using" command.
 *
 *  Dedication:   This program is dedicated to the memory of David
 *                Sellars, a victim of Cystic Fibrosis and a dear
 *                friend.  If you find this software useful, please
 *                send a donation to the Cystic Fibrosis Foundation.
 *
 *                The Cystic Fibrosis Foundation
 *                6931 Arlington Road
 *                Bethesda, MD, 20814
 *                800-344-4823
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define HICOL    1000           /* most columns we can keep track of      */
#define LINELEN  10000          /* size of the input line buffer          */
#define LABELLEN 200            /* size of the scratch buffer for labels  */

static FILE *inpf;              /* file we read data from                 */
static char inpline[LINELEN];   /* a really long input line               */
static char *label[HICOL];      /* Labels for each column                 */
static float type[HICOL];       /* a running avg of the column type       */
static int volume[HICOL];       /* the number of column entries           */
static int width[HICOL];        /* the widest entry in any column         */
static int numlabels;           /* the number of labels in the inpf       */
static int maxcol;              /* the actual largest # of cols           */
static int spare;               /* used for dealing with dup labels       */
static int header;              /* Boolean - whether a header exists      */

static int headercheck(const char *line);
static int parse_header(const char *line);
static void cleanup(char *buffer, int count);
static int parse_line(const char *line);
static int reform_line(const char *line);
static int write_reformed(FILE *out, const char *line);
static int is_string_value(const char *element);
static int issep(char c);
static int readln(char *line);
static void write_dictionary(void);
static char *string_space(const char *buffer);
static int max(int a, int b);

/*
 * check_column(column)  stops the program with a message on stderr when
 *                       'column' cannot be used as an index into the
 *                       per-column tables (label, type, volume, width).
 */
static void check_column(int column)
{
    if (column >= HICOL) {
        fprintf(stderr, "too many columns (limit is %d)\n", HICOL);
        exit(EXIT_FAILURE);
    }
}

/*
 * scan_field(...)  copies characters from line[index] onward into 'buffer'
 *                  until a double-quote (quoted != 0) or a separator
 *                  (quoted == 0) or the end of the line is reached.
 *                  Characters that do not fit in 'bufsize' are consumed
 *                  but dropped, so 'buffer' is never overrun.  Returns the
 *                  index of the character that stopped the scan.
 */
static int scan_field(const char *line, int index, int length, int quoted,
                      char *buffer, int bufsize)
{
    int used = 0;

    while (index < length
           && (quoted ? (line[index] != '"') : !issep(line[index]))) {
        if (used < bufsize - 1)
            buffer[used++] = line[index];
        index++;
    }
    buffer[used] = '\0';
    return index;
}

/*
 * trim_trailing_blanks(text)  removes any spaces at the end of 'text'.
 */
static void trim_trailing_blanks(char *text)
{
    size_t len = strlen(text);

    while (len > 0 && text[len - 1] == ' ')
        text[--len] = '\0';
}

/*
 * set_label(column, text)  stores a private copy of 'text' as the label of
 *                          'column', releasing any label stored earlier for
 *                          the same column.
 */
static void set_label(int column, const char *text)
{
    char *copy = string_space(text);

    strcpy(copy, text);
    free(label[column]);
    label[column] = copy;
}

#ifndef SEP2STATA_NO_MAIN
int main(int argc, char *argv[])
{
    int index;

    /* stdout is the .dct file, so diagnostics go to stderr */
    if (argc < 2) {
        fprintf(stderr, "usage: %s input_file\n", argv[0]);
        return EXIT_FAILURE;
    }
    if ((inpf = fopen(argv[1], "r")) == NULL) {
        fprintf(stderr, "Couldn't open file %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    for (index = 0; index < HICOL; index++) {
        width[index] = 0;
        type[index] = 0;
        volume[index] = 0;
        label[index] = NULL;
    }
    spare = 0;
    maxcol = -1;
    numlabels = 0;
    header = 0;

    /* first pass: find out the type and width of every column */
    if (readln(inpline)) {
        header = headercheck(inpline);
        if (header)
            numlabels = maxcol = parse_header(inpline);
        else
            parse_line(inpline);
        while (readln(inpline))
            parse_line(inpline);
    }

    if (maxcol >= 0)
        write_dictionary();

    /* second pass: go back to the top and convert the data lines */
    if (fseek(inpf, 0L, SEEK_SET) != 0) {
        fprintf(stderr, "Couldn't open file %s\n", argv[1]);
        fclose(inpf);
        return EXIT_FAILURE;
    }
    if (header)
        readln(inpline);        /* skip the header on 2nd pass */
    while (readln(inpline))
        reform_line(inpline);

    fclose(inpf);
    if (fflush(stdout) != 0) {
        fprintf(stderr, "error writing output\n");
        return EXIT_FAILURE;
    }
    return 0;
} /* main */
#endif /* SEP2STATA_NO_MAIN */

/*
 * headercheck(line)  checks whether the first entry on the line (the first
 *                    character that is not a comma, blank, tab, vertical
 *                    bar or backslash) is a double-quote.  Returns a TRUE
 *                    value if so, otherwise a FALSE value.
 */
static int headercheck(const char *line)
{
    size_t i;

    for (i = 0; line[i] != '\0'; i++) {
        char c = line[i];

        if (c == '"')
            return 1;
        if (c != ',' && c != ' ' && c != '\t' && c != '|' && c != '\\')
            return 0;
    }
    return 0;
}

/*
 * parse_header(line)  reads the variable names from the header line into
 *                     label[].  Names may be quoted or unquoted; blanks
 *                     between entries are skipped, and a column without a
 *                     name gets an empty label.  Every name is passed
 *                     through cleanup().  Returns the number of separators
 *                     seen, i.e. the index of the last column.
 */
static int parse_header(const char *line)
{
    char buffer[LABELLEN];
    int length = (int) strlen(line);
    int index = 0;
    int count = 0;              /* column we are working on */
    int nolabel = 1;            /* nothing stored for this column yet */

    while (index < length) {
        char c = line[index];

        if (issep(c)) {
            if (nolabel)        /* a separator with no label before it */
                set_label(count, "");
            index++;
            count++;
            check_column(count);
            nolabel = 1;
        } else if (c != ' ') {
            int quoted = (c == '"');

            if (quoted)
                index++;        /* step over the opening quote */
            index = scan_field(line, index, length, quoted,
                               buffer, (int) sizeof buffer);
            if (quoted)
                index++;        /* step over the closing quote */
            cleanup(buffer, count);
            set_label(count, buffer);
            nolabel = 0;
        } else {
            index++;            /* skip blanks */
        }
    }
    return count;
}

/*
 * cleanup(buffer, count)  turns a header entry into a usable variable name,
 *                         in place: cut to 8 characters, lower-cased,
 *                         blanks/dashes/slashes become '_', the characters
 *                         ^ < * > # @ \ % become 'x', and a leading digit
 *                         or period is replaced by a letter.  A non-blank
 *                         name equal to the label of an earlier column
 *                         (index < count) is replaced by "ditto" plus a
 *                         suffix.  'buffer' must have room for 9 bytes.
 */
static void cleanup(char *buffer, int count)
{
    size_t i;
    int j;

    if (strlen(buffer) > 8)
        buffer[8] = '\0';

    for (i = 0; buffer[i] != '\0'; i++) {
        char c = buffer[i];

        if (c >= 'A' && c <= 'Z')
            c = (char) (c + ('a' - 'A'));
        if (c == ' ' || c == '-' || c == '/')
            c = '_';
        if (strchr("^<*>#@\\%", c) != NULL)
            c = 'x';
        buffer[i] = c;
    }

    /* don't let variable names start with numbers or periods */
    if (buffer[0] >= '0' && buffer[0] <= '9')
        buffer[0] = (char) (buffer[0] - '0' + 'a');
    if (buffer[0] == '.')
        buffer[0] = 'a';

    /*
     * Weed out duplicate column names, but leave blank labels alone:
     * write_dictionary() fills those in with labeln.
     */
    for (j = 0; j < count; j++) {
        if (buffer[0] != '\0' && label[j] != NULL
            && strcmp(label[j], buffer) == 0) {
            if (spare < 26) {
                strcpy(buffer, "ditto");
                buffer[5] = (char) ('a' + spare);
                buffer[6] = '\0';
            } else {
                /* out of letters: carry on with a numeric suffix */
                snprintf(buffer, 9, "ditto%d", spare - 26);
            }
            spare++;
        }
    }
}

/*
 * record_field(column, text)  folds one entry into the statistics kept for
 *                             'column': the running average of "is this a
 *                             string" in type[], the entry count in
 *                             volume[] and the widest entry in width[].
 */
static void record_field(int column, const char *text)
{
    float data_type_val = is_string_value(text) ? 1.0f : 0.0f;
    float fracta = (type[column] * (float) volume[column])
                   / (float) (volume[column] + 1);
    float fractb = data_type_val / (float) (volume[column] + 1);

    type[column] = fracta + fractb;
    volume[column] = volume[column] + 1;
    width[column] = max(width[column], (int) strlen(text));
}

/*
 * parse_line(line)  first-pass routine: looks at every entry on the line,
 *                   records whether it is numeric or a string and how wide
 *                   it is, and raises maxcol if this line has more columns
 *                   than any line before it.  Quoted entries are typed by
 *                   their contents, because a number may have been quoted
 *                   by some other program.  Trailing blanks of unquoted
 *                   entries are dropped, as reform_line() does, and an
 *                   entry that is blank after trimming is ignored.
 *                   Always returns 0.
 */
static int parse_line(const char *line)
{
    char buffer[LINELEN];
    int length = (int) strlen(line);
    int index = 0;
    int count = 0;              /* column we are working on */

    while (index < length) {
        char c = line[index];

        if (issep(c)) {
            index++;
            count++;
            check_column(count);
        } else if (c == '"') {
            index = scan_field(line, index + 1, length, 1,
                               buffer, (int) sizeof buffer);
            index++;            /* step over the closing quote */
            record_field(count, buffer);
        } else {
            index = scan_field(line, index, length, 0,
                               buffer, (int) sizeof buffer);
            trim_trailing_blanks(buffer);
            if (buffer[0] != '\0')
                record_field(count, buffer);
        }
    }
    maxcol = max(count, maxcol);
    return 0;
} /* parse_line */

/*
 * emit_value(out, text)  writes one entry: "." for an empty entry or NA,
 *                        the text in double quotes when it is a string,
 *                        and the bare text when it is numeric.
 */
static void emit_value(FILE *out, const char *text)
{
    if (text[0] == '\0' || strcmp(text, "NA") == 0)
        text = ".";

    if (is_string_value(text))
        fprintf(out, "\"%s\"", text);
    else
        fprintf(out, "%s", text);
}

/*
 * write_reformed(out, line)  does the work of reform_line(), writing to
 *                            'out'.  Separators become tabs, strings are
 *                            quoted, non-strings are written without
 *                            quotes, and a period is written for an entry
 *                            that is missing in the middle of a line.  An
 *                            empty line produces no output.  Always
 *                            returns 0.
 */
static int write_reformed(FILE *out, const char *line)
{
    char buffer[LINELEN];
    int length = (int) strlen(line);
    int index = 0;

    if (length == 0)
        return 0;

    while (index < length) {
        char c = line[index];

        if (issep(c)) {         /* nothing between two separators */
            index++;
            fputs(".\t", out);
            continue;
        }

        if (c == '"') {
            index = scan_field(line, index + 1, length, 1,
                               buffer, (int) sizeof buffer);
            index++;            /* step over the closing quote */
            while (index < length && line[index] == ' ')
                index++;        /* blanks between quote and separator */
        } else {
            index = scan_field(line, index, length, 0,
                               buffer, (int) sizeof buffer);
            trim_trailing_blanks(buffer);
        }
        index++;                /* step over the separator */

        emit_value(out, buffer);
        if (index < length)
            fputc('\t', out);
    }
    fputc('\n', out);
    return 0;
}

/*
 * reform_line(line)  this routine is the meat of the transformation
 *                    and runs during the second pass over the data
 *                    file.  It converts separators to tabs, quotes
 *                    strings, unquotes non-strings, and puts in periods
 *                    for missing data in the middle of lines.  Lines
 *                    that do not contain as many columns as they should
 *                    are *not* completed with "."'s because Stata
 *                    moves on to the next observation when it hits the
 *                    end of line.
 */
static int reform_line(const char *line)
{
    return write_reformed(stdout, line);
}

/*
 * is_string_value(element)  returns TRUE when 'element' has to be treated
 *                           as a string.  "NA" and text made up only of
 *                           digits, periods and a leading minus sign count
 *                           as numeric.  (This routine used to be called
 *                           isalpha, which is a C library name.)
 */
static int is_string_value(const char *element)
{
    size_t i;

    if (strcmp(element, "NA") == 0)
        return 0;

    for (i = 0; element[i] != '\0'; i++) {
        char c = element[i];

        if (c >= '0' && c <= '9')
            continue;
        if (c == '.')
            continue;
        if (c == '-' && i == 0)
            continue;
        return 1;
    }
    return 0;
}

/*
 * issep(c)  returns TRUE when 'c' ends an entry: backslash, newline, tab,
 *           the string terminator, vertical bar or comma.
 */
static int issep(char c)
{
    return ((c == '\\') || (c == '\n') || (c == '\t') ||
            (c == '\0') || (c == '|') || (c == ','));
}

/*
 * readln(line)  reads the next line of inpf into 'line' without its
 *               newline.  'line' must have room for LINELEN bytes; a
 *               longer input line is cut short with a warning on stderr.
 *               Returns 1 when a line was read (it may be empty) and 0
 *               when the end of the file was reached with nothing read.
 */
static int readln(char *line)
{
    int used = 0;
    int truncated = 0;
    int ch;                     /* int, so EOF differs from every byte */

    while ((ch = fgetc(inpf)) != EOF && ch != '\n') {
        if (used < LINELEN - 1)
            line[used++] = (char) ch;
        else
            truncated = 1;
    }
    line[used] = '\0';

    if (truncated)
        fprintf(stderr,
                "warning: input line longer than %d characters truncated\n",
                LINELEN - 1);

    return (ch != EOF || used > 0);
}

/*
 * write_dictionary()  writes the stata dictionary for columns 0..maxcol to
 *                     stdout.  A column that held more than 50% strings is
 *                     declared strN, N being its widest entry.  A column
 *                     without a usable label is named labeln.
 */
static void write_dictionary(void)
{
    int index;

    printf("dictionary {\n");
    for (index = 0; index <= maxcol; index++) {
        int unnamed = (index > numlabels) || (!header)
                      || (label[index] == NULL) || (*label[index] == '\0');

        if (type[index] > 0.5)  /* > 50% strings? */
            printf("str%d ", max(width[index], 1));
        if (unnamed)
            printf("label%d\n", index);
        else
            printf("%s\n", label[index]);
    }
    printf("}\n");
}

/*
 * string_space(buffer)  returns newly malloc'ed memory big enough to hold
 *                       a copy of 'buffer'.  The text is not copied.  The
 *                       program stops if the memory cannot be had.
 */
static char *string_space(const char *buffer)
{
    char *result = malloc(strlen(buffer) + 2);

    if (result == NULL) {
        fprintf(stderr, "could not malloc needed memory for string\n");
        exit(EXIT_FAILURE);
    }
    return result;
}

/*
 * max(a, b)  returns the larger of the two values.
 */
static int max(int a, int b)
{
    return (a > b) ? a : b;
}