/* filesrta.c */ /* COPYRIGHT (C) 1996-2002 Norbert H. Doerry SYNTAX FILESRTA Infile -oOutfile -fXY where Infile = Filename of input text file containing records delimited by newline characters and fields delimited by whitespace characters. Blank lines and lines starting with ! are ignored. Outfile = Output File sorted. If omitted, Outfile is printed to the standard output. -fXY = Field to sort to. X is the field number starting with 1. Fields are separated by whitespaces Y is either A = alphabetic sorting (default) N = numeric sorting Filesrta.exe can handle large files because it only stores in memory the actual field sorted on, and the offset in the file to the start of the line. This slows down the printing of the output file somewhat, but enables large files to be sorted. Lines can not exceed 1024 characters Version 1.0 December 1996 - originial version Version 1.0a January 1997 - fixed minor bugs Version 2.0 August 1997 - Changed the sorting algorithm to increase speed. Version 2.0a June 2002 - Added GNU GPL, recompile using 32 bit compiler fixed type-casting error. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. to report bugs, send an email to doerry@aol.com */ #include #include #include #include #include #define VERSION "2.0" #define MAXCHAR 1024 #define SORT_ALPHA 1 #define SORT_NUMERIC 2 #define DEBUG 1 /**********************************************************************/ typedef struct FileSort { char *Infile; FILE *in; char *Outfile; FILE *out; int sort_field; int sort_type; struct List *l; int debug; } FILESORT; typedef struct Element { char *s; /* character string to sort on */ long offset; /* offset in file for line with the parent search fields */ struct Element *next; } ELEMENT; typedef struct List { struct Element *e; struct Element *last; /* pointer to last Element in the List */ long count; /* count the number of offsets */ } LIST; /*******************************************************************/ void initialize_filesort(FILESORT *); void read_command_line(FILESORT *,int,char **); void print_help(void); void print_debug(FILESORT *); char *copystring(char *); char *strextract(char *,int); void strstrip(char *); void read_file(FILESORT *); void store_line(FILESORT *,char *,long); int compare(char *,char *,int ); ELEMENT *alloc_ELEMENT(void); void print_list(LIST *); void print_output_file(FILESORT *); void print_output_file_alpha(FILESORT *); void print_output_file_numeric(FILESORT *); void sort_sublist(LIST *,int,int); void sort_list(LIST *,int,int); /*******************************************************************/ int main(int argc, char **argv) { FILESORT fs; initialize_filesort(&fs); read_command_line(&fs,argc,argv); if (fs.debug) print_debug(&fs); if (fs.out != stdout) printf("Done Reading Command Line\n"); if (fs.out != stdout) printf("Reading File %s\n",fs.Infile); read_file(&fs); if (fs.out != stdout) printf("Done Reading File %s\n",fs.Infile); if (fs.debug > 4) print_list(fs.l); if (fs.out != stdout) printf("Sorting %ld records\n",fs.l->count); sort_list(fs.l,fs.sort_type,fs.debug); if (fs.out != stdout) printf("Done Sorting %ld records\n",fs.l->count); if (fs.out != stdout) printf("Writing File %s\n",fs.Outfile); print_output_file(&fs); if (fs.out != stdout) printf("Done Writing File %s\n",fs.Outfile); return EXIT_SUCCESS; } void initialize_filesort(FILESORT *fs) { fs->Infile = (char *) NULL; fs->in = (FILE *) NULL; fs->Outfile = (char *) NULL; fs->out = (FILE *) NULL; fs->sort_field = 1; fs->sort_type = SORT_ALPHA; fs->l = (LIST *) calloc((size_t) 1, sizeof(LIST)); if (fs->l == (LIST *) NULL) { fprintf(stderr, " ***ERROR: Out of Memory in initialize_filesort()<%d>\n",__LINE__); exit(EXIT_FAILURE); }; fs->l->e = (ELEMENT *) NULL; fs->l->last = (ELEMENT *) NULL; fs->l->count = 0l; fs->debug = 0; } void read_command_line(FILESORT *fs,int argc,char **argv) { int i,j; for (i=1 ; i < argc ; i++) { if (argv[i][0] == '-' || argv[i][0] == '/') { if (argv[i][1] == 'o' || argv[i][1] == 'O') { if (fs->out != (FILE *) NULL) { fprintf(stderr," ***ERROR: Output file multiply defined\n"); exit(EXIT_FAILURE); } fs->Outfile = copystring(argv[i]+2); fs->out = fopen(fs->Outfile,"w"); if (fs->out == (FILE *) NULL) { fprintf(stderr," ***ERROR: Unable to open Output File %s\n", fs->Outfile); exit(EXIT_FAILURE); } } else if (argv[i][1] == 'f' || argv[i][1] == 'F') { fs->sort_field = 0; fs->sort_type = SORT_ALPHA; for (j=2 ; isdigit((int)argv[i][j]) ; j++) { fs->sort_field = 10*fs->sort_field + (int)(argv[i][j] - '0'); } if (fs->sort_field < 1) fs->sort_field = 1; if (argv[i][j] == 'a' || argv[i][j] == 'A') fs->sort_type = SORT_ALPHA; else if (argv[i][j] == 'n' || argv[i][j] == 'N') fs->sort_type = SORT_NUMERIC; } else if (argv[i][1] == 'd' || argv[i][1] == 'D') { if (isdigit((int)argv[i][2])) { fs->debug = (int)(argv[i][2]-'0'); } else fs->debug = 1; } else if (argv[i][1] == 'h' || argv[i][1] == 'H' || argv[i][1] == '?') { print_help(); exit(EXIT_SUCCESS); } } else /* input filename */ { if (fs->in != (FILE *) NULL) { fprintf(stderr," ***ERROR: Input file multiply defined\n"); exit(EXIT_FAILURE); } fs->Infile = copystring(argv[i]); fs->in = fopen(fs->Infile,"r"); if (fs->in == (FILE *) NULL) { fprintf(stderr," ***ERROR: Unable to open Input File %s\n", fs->Infile); exit(EXIT_FAILURE); } } } if (fs->in == (FILE *) NULL) { fs->in = stdin; fs->Infile = copystring(""); } if (fs->out == (FILE *) NULL) { fs->out = stdout; fs->Outfile = copystring(""); } } void print_help(void) { printf(" FILESRTA version %s <%s>\n\n",VERSION,__DATE__); printf(" SYNTAX: FILESRTA Infile -oOutfile -fXY\n"); printf("\n"); printf(" where\n\n"); printf(" Infile = Filename of input text file containing records\n"); printf(" delimited by newline characters and fields delimited\n"); printf(" by whitespace characters. Blank lines and lines\n"); printf(" starting with ! are ignored.\n"); printf(" Outfile = Output File sorted. If omitted, Outfile is printed\n"); printf(" to the standard output.\n"); printf(" -fXY = Field to sort to.\n"); printf(" X is the field number starting with 1.\n"); printf(" Fields are separated by whitespaces.\n"); printf(" Y is either:\n"); printf(" A = alphabetic sorting (default)\n"); printf(" N = numeric sorting\n"); } void print_debug(FILESORT *fs) { printf("Infile = %s\n",fs->Infile); printf("Outfile = %s\n",fs->Outfile); printf("sort_field = %d\n",fs->sort_field); printf("sort_type = %d (%d = SORT_ALPHA, %d = SORT_NUMERIC)\n", fs->sort_type,SORT_ALPHA,SORT_NUMERIC); printf("l->count = %ld\n",fs->l->count); } char *copystring(char *s) { char *ps; if (s == (char *) NULL) ps = (char *) calloc((size_t) 1,sizeof(char)); else ps = (char *) calloc(strlen(s)+1,sizeof(char)); if (ps == (char *) NULL) { fprintf(stderr," ***ERROR: OUT OF MEMORY IN COPYSTRING() <%d>\n",__LINE__); exit(EXIT_FAILURE); } if (s == (char *) NULL) *ps = (char) NULL; else { strcpy(ps,s); } return ps; } char *strextract(char *s,int n) { char *ps,*sn; int ncnt; int slen; int i; for (ncnt = 1,ps = s ; ncnt < n ; ncnt++) { /* skip leading spaces */ while (isspace((int)(*ps))) ps++; /* go past the word */ while (isspace((int)(*ps)) == 0 && *ps != (char) NULL) ps++; } /* skip leading spaces */ while(isspace((int)(*ps))) ps++; /* find the length of the nth word */ for (slen = 0 ; isspace((int)ps[slen]) == 0 && ps[slen] != (char) NULL ; slen++); /* allocate the answer */ sn = (char *) malloc((size_t)((slen+1)*sizeof(char))); if (sn == (char *) NULL) { fprintf(stderr," ***ERROR: Out of Memory in strextract() <%d>\n",__LINE__); exit(EXIT_FAILURE); } /* see if zero length line */ if (slen == 0) { sn[0] = (char) NULL; return sn; } /* copy the string */ for (i=0 ; i < slen ; i++,ps++) sn[i] = *ps; sn[slen] = (char) NULL; return sn; } void strstrip(char *s) { int i,j; if (s == (char *) NULL) return; /* find first non space */ for (i=0;s[i] != (char) NULL && isspace((int)s[i]) ; i++); /* move the entire string left */ for (j=0;s[i] != (char) NULL ;i++,j++) s[j] = s[i]; s[j] = (char) NULL; /* strip trailing spaces */ for (j-- ; j >= 0 && isspace((int)s[j]) ; j--) s[j] = (char) NULL; } void read_file(FILESORT *fs) { char rdline[MAXCHAR]; long offset; int cnt; cnt = 0; offset = ftell(fs->in); /* get the position in the file for the line */ while (fgets(rdline,MAXCHAR-1,fs->in)) { if (rdline[0] != '!') /* skip comment lines */ { strstrip(rdline); /* eliminate leading and trailing whitespace */ if (rdline[0] != (char) NULL) /* skip blank lines */ { store_line(fs,rdline,offset); if (fs->debug == 4) printf("OFFSET = %ld <> RDLINE = %s\n",offset,rdline); else if (fs->out != stdout) { cnt++; if (cnt % 200 == 0) { printf("."); fflush(stdout); } } } } offset = ftell(fs->in); /* get the new position of the file */ } /* rewind the file: move the filepointer to the beginning */ rewind(fs->in); if (fs->out != stdout) printf("\n"); } void store_line(FILESORT *fs,char *rdline,long offset) { char *ps; ELEMENT *e,*ee,*le; int i; ps = strextract(rdline,fs->sort_field); /* find the field */ if (ps[0] == (char) NULL) /* don't store zero length elements */ { free((void *) ps); return; } /* see if the first entry */ if (fs->l->e == (ELEMENT *) NULL) { fs->l->e = alloc_ELEMENT(); fs->l->e->s = ps; fs->l->e->offset = offset; fs->l->last = fs->l->e; } else { /* otherwise should be the last */ fs->l->last->next = alloc_ELEMENT(); fs->l->last->next->s = ps; fs->l->last->next->offset = offset; fs->l->last = fs->l->last->next; } fs->l->count++; } #define MIN_ELEMENT 50 void sort_list(LIST *ls, int sort_type,int debug) { LIST **list; int nbr_lists; int nbr_element; int i,j; ELEMENT *e, *ee; int min; /* see if only need to sort this one list */ if (ls->count <= MIN_ELEMENT) { sort_sublist(ls,sort_type,debug); return; } /* calculate the number of elements in each sub-list */ nbr_element = (int) ceil(sqrt((double)ls->count)); if (nbr_element < MIN_ELEMENT) nbr_element = MIN_ELEMENT; /* calculate the number of sub-lists */ nbr_lists = (int) ceil((double) ls->count / (double) nbr_element); if (debug) printf("nbr_lists = %d\nnbr_element = %d\n",nbr_lists,nbr_element); /* allocate the sub-list array */ list = (LIST **)calloc((size_t) nbr_lists, sizeof(LIST *)); if (list == (LIST **) NULL) { fprintf(stderr,"*** ERROR : Out of memory in sort_list() <%d>\n",__LINE__); exit(EXIT_FAILURE); } /* allocate the sub-lists */ for (i = 0 ; i < nbr_lists ; i++) { list[i] = (LIST *)calloc((size_t) 1, sizeof(LIST)); if (list[i] == (LIST *) NULL) { fprintf(stderr,"*** ERROR : Out of memory in sort_list() <%d>\n",__LINE__); exit(EXIT_FAILURE); } list[i]->e = (ELEMENT *) NULL; list[i]->last = (ELEMENT *) NULL; list[i]->count = 0l; } /* break up the list into sub-lists */ for (i = 0, e = ls->e ; i < nbr_lists && e != (ELEMENT *) NULL ; i++) { list[i]->e = e; if (list[i]->e != (ELEMENT *) NULL ) list[i]->count = 1l; for (j = 1 ; j < nbr_element && e != (ELEMENT *) NULL; j++) { e = e->next; list[i]->count++; } if (e != (ELEMENT *)NULL) { ee = e->next; e->next = (ELEMENT *) NULL; e = ee; } else { list[i]->count--; } } if (debug) printf("sub-lists created, starting to sort sub-lists\n"); /* sort each sub-list */ for (i = 0 ; i < nbr_lists ; i++) { if (list[i] == (LIST *) NULL) break; if (debug) printf("list[%d]->count = %ld\n",i,list[i]->count); if (list[i]->count > 1l) sort_list(list[i],sort_type,debug); } if (debug) printf("sub-lists sorted, assembling the list\n"); /* assemble the list */ ls->e = (ELEMENT *) NULL; ls->last = (ELEMENT *) NULL; ls->count = 0l; while (1) { /* find the first element */ for (min = 0 ; min < nbr_lists ; min++) if (list[min]->count > 0) break; /* see if done sorting */ if (min == nbr_lists) { break; } /* find the minimum */ for (i = min + 1 ; i < nbr_lists ; i++) { if (list[i]->count <= 0) continue; if (compare(list[min]->e->s,list[i]->e->s,sort_type) > 0) { min = i; } } /* insert the minimum onto the end of the list */ if (ls->e == (ELEMENT *) NULL) { ls->e = list[min]->e; ls->last = ls->e; } else { ls->last->next = list[min]->e; ls->last = ls->last->next; } ls->count++; /* take the element out of the sublist */ list[min]->e = list[min]->e->next; list[min]->count--; /* NULL terminate the list */ ls->last->next = (ELEMENT *) NULL; } } void sort_sublist(LIST *ls,int sort_type,int debug) { ELEMENT *e,*ne,*be; /* set the base */ be = ls->e; ls->e = (ELEMENT *) NULL; while (be != (ELEMENT *) NULL) { ne = be->next; /* see if the only element in the new list */ if (ls->e == (ELEMENT *) NULL) { ls->e = be; be->next = (ELEMENT *) NULL; } /* see if the first element in the new list */ else if (compare(be->s,ls->e->s,sort_type) < 0) { be->next = ls->e; ls->e = be; } /* otherwise insert in the right place */ else { for (e = ls->e ; e->next != (ELEMENT *) NULL ; e = e->next) { if (compare(be->s,e->next->s,sort_type) < 0) break; } be->next = e->next; e->next = be; } be = ne; } } /* returns neg number if s1 < s2, 0 if s1 = s2, pos number if s1 > s2 */ int compare(char *s1,char *s2,int sort_type) { long l1,l2; char *ps1,*ps2; if (sort_type == SORT_ALPHA) return strcmp(s1,s2); /* pure alphabetic comparison */ if (sort_type != SORT_NUMERIC) return 0; /* don't know the sort, claim equal */ l1 = strtol(s1,&ps1,10); l2 = strtol(s2,&ps2,10); if (l1 > l2) return 1; if (l1 < l2) return -1; return strcmp(ps1,ps2); } ELEMENT *alloc_ELEMENT(void) { ELEMENT *e; e = (ELEMENT *) calloc((size_t) 1, sizeof(ELEMENT)); if (e == (ELEMENT *) NULL) { fprintf(stderr," ***ERROR: Out of Memory in alloc_ELEMENT() <%d>\n",__LINE__); exit(EXIT_FAILURE); } e->s = (char *) NULL; e->offset = 0l; e->next = (ELEMENT *) NULL; return e; } void print_list(LIST *l) { ELEMENT *e; for (e = l->e ; e != (ELEMENT *) NULL ; e = e->next) { printf("s=%s offset=%ld\n",e->s,e->offset); } } void print_output_file(FILESORT *fs) { ELEMENT *e; char rdline[MAXCHAR]; int cnt; cnt= 0; for (e = fs->l->e ; e != (ELEMENT *) NULL ; e = e->next) { fseek(fs->in,e->offset,SEEK_SET); fgets(rdline,MAXCHAR-1,fs->in); fputs(rdline,fs->out); if (fs->out != stdout) { cnt++; if (cnt % 200 == 0) { printf("+"); fflush(stdout); } } } if (fs->out != stdout) printf("\n"); if (fs->out != stdout) fclose(fs->out); if (fs->in != stdin) fclose(fs->in); }