/***************************************/ /* Weeder 1.4 see LICENSE.txt file for terms and conditions of use */ #define MAXSEQ 100000 #define CPS CLOCKS_PER_SEC #include #include #include #include #include #include #include #include #include #include #include char **sequence; char **name; int *lens; int winlen = 8; int MINLEN = 8; int MAXGAP = 10000; int *nuclei; int *revnuclei; int gloplen = 0; char alphabet[4] = { 'A', 'C', 'G', 'T' }; int totseq = 0; int uu, pp, qq; int MAXERR = 0; FILE *F; int reversed = 0; struct pattern { int shadowed; int rank; int overlap; int hamming; int inside; int outside; int hor; int error; int vert; double max; double min; int **matrix; float score; unsigned short int counter; unsigned short int inshuffled; char *pat; struct pattern *next; struct brother *nextbro; }; struct brother { char pat[14]; struct brother *nextbro; }; struct pattern *headpat[4]; struct pattern *pointer[4]; struct pattern *endpat; struct pattern *seekpat; struct brother *newbro; int chartoint (char car); void insert (int start, int end, int seqno); struct pattern *mergesrt (struct pattern *primo, int quanti); struct pattern *merge (struct pattern *uno, struct pattern *due); char revcomp (char c); char *revpattern (char *a); FILE *OF; FILE *HF; int adviceflag = 1; main (int argc, char *argv[]) { int threshold = 85; double percentage; char locatepattern[50]; int maah; int rankcount; int flagjump; int flagone = 0; int pid; char urlname[200] = ""; char commandline[200] = ""; FILE *L; char **color; double newmin, newmax; int **newmatrix; int **matrix; double max; double min; double mintmp, maxtmp; double tmpflo; struct pattern *newpat; double expected; double revmy; int jj; int partial; int bestpos; int uno, due; int i, j, k, l; int ii, flagnope; int nn; int howmany = 0; int howmanynew = 0; int whereavg = 0; int lastinseq = -1; int n; int nchar; char buffer[MAXSEQ]; char infile[400] = "stdin"; char patfile[400] = ""; int len; int perint; double thr; struct pattern *tmppat; char entry[4][6]; int partiallen = 0; int patlimit = 4; int maxflag = 0, repflag = 0, errflag = 0, minflag = 0; FILE *G; char outfile[200] = ""; char browsefile[200] = ""; pid = getpid (); color = (char **) malloc (4 * sizeof (char *)); for (i = 0; i < 4; i++) bzero (entry[i], 6); for (i = 0; i < 4; i++) color[i] = (char *) malloc (10); strcpy (color[2], "gold"); strcpy (color[3], "red"); strcpy (color[1], "blue"); strcpy (color[0], "green"); i = 1; lens = (int *) malloc ((MAXSEQ*2) * sizeof (int)); /* Conteggio lunghezza sequenze */ if(argc < 4) { fprintf (stderr, "\nUsage: locator.out inputfile oligo substitutions [threshold] [S]\n", infile); exit (0); } sprintf (infile, "%s", argv[1]); sprintf (locatepattern, "%s", argv[2]); MAXERR = atoi(argv[3]); if ((argc > 4) && (strcmp (argv[4], "S") == 0)) reversed = 1; if ((argc > 5) && (strcmp (argv[5], "S") == 0)) reversed = 1; if ((argc > 5) && (strcmp (argv[5], "S") != 0)) threshold = atoi(argv[5]); if ((argc > 4) && (strcmp (argv[4], "S") != 0)) threshold = atoi(argv[4]); if(threshold == 100) threshold = 99; if((threshold < 0)||(threshold > 100)) { fprintf (stderr, "\nPercentage threshold should be between 1 and 100\n"); exit (0); } G = fopen (infile, "r"); if (G == NULL) { fprintf (stderr, "\nError in opening file %s\n", infile); exit (0); } partiallen = 0; l = 0; nchar = 0; totseq = 0; bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); if (buffer[0] != '>') { fprintf (stderr, "\nInput file not in FASTA format\nExiting program\n"); exit (0); } matrix = (int **) malloc (strlen(locatepattern) * sizeof (int *)); for (j = 0; j < strlen(locatepattern); j++) matrix[j] = (int *) malloc (4 * sizeof (int)); for (j = 0; j < strlen(locatepattern); j++) for(i = 0; i < 4; i++) matrix[j][i] = 0; while (nchar != EOF) { totseq++; nchar = fscanf (G, "\n%[^\n]", buffer); while ((buffer[0] != '>') && (nchar != EOF)) { partiallen += strlen (buffer); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); } lens[l++] = partiallen; partiallen = 0; } /* Fine conteggio lunghezza sequenze */ fclose (G); sprintf (patfile, "%s.mix", infile); sprintf (browsefile, "%s.%s.html", infile, locatepattern); sprintf (outfile, "%s.%s.wee", infile,locatepattern); fprintf (stderr, "\nRunning locator on file %s\nWriting output on files %s and %s\n", infile, outfile, browsefile); G = fopen (infile, "r"); if (reversed == 0) sequence = (char **) malloc ((totseq + 1) * sizeof (char *)); else sequence = (char **) malloc ((2 * totseq + 1) * sizeof (char *)); if (reversed == 0) name = (char **) malloc ((totseq + 1) * sizeof (char *)); else name = (char **) malloc ((2 * totseq + 1) * sizeof (char *)); for (i = 0; i < totseq + 1; i++) { name[i] = (char *) malloc ((80) * (sizeof (char))); if(reversed == 0) sequence[i] = (char *) malloc ((lens[i] + 1 + winlen) * (sizeof (char))); else sequence[i] = (char *) malloc (2 * (lens[i] + 1 + winlen) * (sizeof (char))); } if (reversed == 1) for (i = 0; i < totseq; i++) sequence[i + totseq] = (char *) malloc ((lens[i] + 1 + winlen) * sizeof (char)); if (reversed == 1) for (i = 0; i < totseq; i++) name[i + totseq] = (char *) malloc ((80) * (sizeof (char))); bzero (buffer, MAXSEQ); l = 0; fscanf (G, "\n%[^\n]", sequence[l]); ERE: if (sequence[l][0] == '>') { strncpy (name[l], sequence[l], 39); if (reversed == 1) { strcpy (name[l + totseq], "REVCOMP "); strcat (name[l + totseq], name[l]); } bzero (sequence[l], lens[l]); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); while ((buffer[0] != '>') && (nchar != EOF)) { strcat (sequence[l], buffer); bzero (buffer, MAXSEQ); nchar = fscanf (G, "\n%[^\n]", buffer); } } bzero (sequence[l + 1], lens[l + 1]); if (buffer[0] == '>') strcat (sequence[l + 1], buffer); len = strlen (sequence[l]); for (i = 0; i < lens[l]; i++) { sequence[l][i] = tolower (sequence[l][i]); } lens[l] = strlen (sequence[l]); l++; if (l < totseq) goto ERE; if (reversed == 1) { for (l = 0; l < totseq; l++) { for (i = lens[l] - 1; i >= 0; i--) { sequence[l + totseq][lens[l] - i - 1] = revcomp (sequence[l][i]); sequence[l][lens[l]+lens[l]-i-1] = revcomp (sequence[l][i]); } lens[l + totseq] = lens[l]; } //totseq = totseq * 2; } fclose (G); endpat = (struct pattern *) malloc (sizeof (struct pattern)); endpat->score = -111111.0; OF = fopen (outfile, "w"); HF = fopen (browsefile, "w"); fprintf (OF, "\n\n**** LOCATING OLIGO : %s ****\n\n", locatepattern); fprintf (HF, ""); fprintf (HF, "

**** LOCATING OLIGO : %s ****

", locatepattern); /****************************************************************/ /* INIZIO RICERCA ESATTA */ /****************************************************************/ winlen = strlen(locatepattern); nuclei = (int *) malloc (winlen * (sizeof (int))); revnuclei = (int *) malloc (winlen * (sizeof (int))); for (uu = 0; uu < totseq; uu++) { bestpos = -1; if(reversed == 0) maah = (lens[uu]) - strlen (locatepattern) + 1; else maah = (2 * lens[uu]) - strlen (locatepattern) + 1; for (j = 0; j < maah; j++) { k = 0; flagnope = 0; for (pp = 0; pp < strlen (locatepattern); pp++) if (chartoint (tolower (sequence[uu][j + pp])) < 0) flagnope = 1; if (flagnope == 0) { for (pp = 0; pp < strlen (locatepattern); pp++) { nuclei[pp] = chartoint (tolower (sequence[uu][j + pp])); if (tolower (sequence[uu][j + pp]) != tolower (locatepattern[pp])) { k++; } } } else k = MAXERR + 1; if ((k <= MAXERR) &&((reversed == 0)||((j < lens[uu]- strlen (locatepattern)+1)||(j >= lens[uu])))) { howmany++; nn = strlen (locatepattern) - 1; pp = 0; for (jj = 0; jj <= nn; jj++) pp += nuclei[nn - jj] * (int) pow (4.0, jj); for (jj = 0; jj <= nn; jj++) { matrix[jj][nuclei[jj]]++; } if (lastinseq != uu) { lastinseq = uu; bestpos = j; } k = 0; } } } for (pp = 0; pp < strlen (locatepattern); pp++) { maxtmp = 0; mintmp = 1; for (qq = 0; qq < 4; qq++) { if (((double) matrix[pp][qq] / howmany) > maxtmp) maxtmp = (double) matrix[pp][qq] / howmany; if (((double) matrix[pp][qq] / howmany) < mintmp) mintmp = (double) matrix[pp][qq] / howmany; } min += log (mintmp + 0.001); max += log (maxtmp); } // QUI SCRIVE //bzero (logofile, 200); //sprintf (logofile, "%s.%d.%d.logo", infile, i, //seekpat->next->rank); //L = fopen (logofile, "a"); newmatrix = (int **) malloc (strlen(locatepattern) * sizeof (int *)); for (j = 0; j < strlen(locatepattern); j++) newmatrix[j] = (int *) malloc (4 * sizeof (int)); fprintf (OF, "\n\n%s with %d substitutions and %d percent threshold\n", locatepattern, MAXERR, threshold); if (reversed == 1) fprintf (OF, "[%s]\n ", revpattern (locatepattern)); fprintf (HF, ""); fprintf (HF, "

%s %d substitutions and %d percent threshold
", locatepattern, MAXERR, threshold); if (reversed == 1) fprintf (HF, "[%s]
", revpattern (locatepattern)); fprintf (HF, "
"); fprintf (OF, "\n"); fprintf (HF, "
"); if(howmany == 0) { fprintf (OF, "\nPattern not found\n"); fprintf (HF, "
Pattern not found
"); fclose(OF); fclose(HF); exit(0); } fprintf (OF, "\nBest occurrences (match percentage): \n"); fprintf (HF, "
Best occurrences (match percentage):
"); /* if (i == 0) MAXERR = 1; else MAXERR = i + 1; */ // seekpat->next = bestp[i]; for (uu = 0; uu < strlen(locatepattern); uu++) for (j = 0; j < 4; j++) newmatrix[uu][j] = 0; howmanynew = 0; lastinseq = -1; //seekpat->next->inshuffled = 0; expected = 0; //appast = (char *) malloc (strlen (locatepattern)); for (uu = 0; uu < totseq; uu++) { fprintf (OF, "\n%s\n", name[uu]); fprintf(HF,"
%10s
", name[uu]); bestpos = -1; //bzero (appast, strlen (locatepattern)); if(reversed == 0) maah = (lens[uu]) - strlen (locatepattern) + 1; else maah = (2*lens[uu]) - strlen (locatepattern) + 1; for (j = 0; j < maah; j++) { k = 0; flagnope = 0; for (pp = 0; pp < strlen (locatepattern); pp++) if (chartoint (tolower (sequence[uu][j + pp])) < 0) flagnope = 1; if (flagnope == 0) { for (pp = 0; pp < strlen (locatepattern); pp++) { nuclei[pp] = chartoint (tolower (sequence[uu][j + pp])); } } else k = MAXERR + 1; if ((k <= MAXERR) &&((reversed == 0)||((j < lens[uu]- strlen (locatepattern)+1)||(j >= lens[uu])))) { nn = strlen (locatepattern) - 1; tmpflo = 0; for (jj = 0; jj <= nn; jj++) { tmpflo += log ((double) matrix[jj][nuclei[jj]]) - log ((double) howmany); //fprintf(OF, "%c", alphabet[nuclei[jj]]); } percentage = (100.0 * (tmpflo - min) / (max - min)); if (percentage >= threshold) { howmanynew++; //fprintf (L, "%s\n", name[uu]); //fprintf (HF, "
%.20s --- ", name[uu]); if(reversed == 0) { fprintf (OF, "+ "); fprintf (HF, "+ "); } else { if(j < lens[uu]) { fprintf (OF, "+ "); fprintf (HF, "+ "); } else { fprintf (OF, "- "); fprintf (HF, "- "); } } if (percentage < 90) { fprintf (OF, "["); fprintf (HF, "["); } else { fprintf (OF, " "); fprintf (HF, "."); } for (jj = 0; jj <= nn; jj++) { fprintf (OF, "%c", alphabet[nuclei[jj]]); //fprintf (L, "%c", alphabet[nuclei[jj]]); fprintf (HF, "%c", color[nuclei[jj]], alphabet[nuclei[jj]]); if (percentage >= 90) newmatrix[jj][nuclei[jj]]++; } if (percentage < 90) { fprintf (OF, "]"); fprintf (HF, "]"); } else { fprintf (OF, " "); fprintf (HF, "."); } if ((reversed == 1)) { fprintf (OF, "\n"); fprintf (HF, "
"); if (percentage < 90) { fprintf (OF, " ["); fprintf (HF, " ["); } else { fprintf (OF, " "); fprintf (HF, " ."); } for (jj = nn; jj >= 0; jj--) { fprintf (OF, "%c", revcomp (alphabet [nuclei[jj]])); //fprintf (L, "%c", alphabet[nuclei[jj]]); fprintf (HF, "%c", revcomp (alphabet [nuclei[jj]])); } if (percentage < 90) { fprintf (OF, "]"); fprintf (HF, "]"); } else { fprintf (OF, " "); fprintf (HF, "."); } if(j >= lens[uu]) { fprintf (OF, " position %d,", 2*lens[uu] - j - strlen (locatepattern) + 1); fprintf (HF, " ---
position %d,", 2*lens[uu] - j - strlen (locatepattern) + 1); } else { fprintf (OF, " position %d,", j + 1); fprintf (HF, " --- position %d,", j + 1); } } else { fprintf (OF, " position %d,", j + 1); fprintf (HF, " --- position %d,", j + 1); } fprintf (OF, " (%.2f) ", percentage); fprintf (HF, " (%.2f) ", percentage); fprintf (OF, "\n"); //fprintf (L, "\n"); fprintf (HF, "
"); } // fprintf(OF, " %f (%d) ", tmpflo, k); //fprintf(OF, " %f ", 100.0*(tmpflo - seekpat->next->min)/(seekpat->next->max - seekpat->next->min)); k = 0; } } } newmin = 0; newmax = 0; fprintf (OF, "\n\t Frequency Matrix \n"); fprintf (OF, "\t All Occurrences \t\t Best Occurrences \n"); fprintf (OF, "\t A C G T\t\t A C G T\n"); fprintf (HF, "
"); fprintf (HF, ""); fprintf (HF, "
Frequency Matrix
All Occs Best Occs"); fprintf (HF, "
ACGT ACGT", color[0], color[1], color[2], color[3], color[0], color[1], color[2], color[3]); for (pp = 0; pp < strlen (locatepattern); pp++) { sprintf (entry[0], "%d", matrix[pp][0]); sprintf (entry[1], "%d", matrix[pp][1]); sprintf (entry[2], "%d", matrix[pp][2]); sprintf (entry[3], "%d", matrix[pp][3]); fprintf (OF, "%d \t%5d %5d %5d %5d\t", pp + 1, matrix[pp][0], matrix[pp][1], matrix[pp][2], matrix[pp][3]); fprintf (OF, "\t%5d %5d %5d %5d\n", newmatrix[pp][0], newmatrix[pp][1], newmatrix[pp][2], newmatrix[pp][3]); fprintf (HF, "
%d \t%d %d %d %d", pp + 1, matrix[pp][0], matrix[pp][1], matrix[pp][2], matrix[pp][3]); fprintf (HF, "|\t%d %d %d %d", newmatrix[pp][0], newmatrix[pp][1], newmatrix[pp][2], newmatrix[pp][3]); } fprintf (HF, "

"); fprintf (OF, "\n==========================================\n"); fprintf (HF, "


"); fclose (OF); fprintf (HF, ""); fclose (HF); exit (0); } int chartoint (char car) { int ret; switch (car) { case 'a': return 0; break; case 'c': return 1; break; case 'g': return 2; break; case 't': return 3; break; case '$': return 4; break; default: return -1; break; } } struct pattern * mergesrt (struct pattern *primo, int quanti) { int i, j, k; int first, second; double mid; struct pattern *l1, *l2; struct pattern *ans; if (quanti == 1) return primo; else { l1 = primo; l2 = primo; mid = (double) (quanti) / 2; if (mid > floor (mid)) { mid = mid - .5; first = (int) mid; second = first + 1; } else { first = (int) mid; second = (int) mid; } for (i = 0; i < first - 1; i++) l2 = l2->next; ans = l2; l2 = l2->next; ans->next = endpat; l1 = mergesrt (l1, first); l2 = mergesrt (l2, second); ans = merge (l1, l2); return ans; } } struct pattern * merge (struct pattern *uno, struct pattern *due) { struct pattern *ret, *first; int flag = 0; if ((uno->score > due->score) || ((uno->score == due->score) && (uno->counter >= due->counter))) { first = uno; uno = uno->next; ret = first; ret->next = endpat; } else { first = due; due = due->next; ret = first; ret->next = endpat; } while (flag == 0) { if ((uno->score > due->score) || ((uno->score == due->score) && (uno->counter >= due->counter))) { if (uno->score == -111111) flag = 1; else { ret->next = uno; uno = uno->next; ret = ret->next; ret->next = endpat; } } else { ret->next = due; due = due->next; ret = ret->next; ret->next = endpat; } } return first; } char revcomp (char c) { char ret; switch (tolower (c)) { case 'a': ret = 'T'; break; case 'c': ret = 'G'; break; case 'g': ret = 'C'; break; case 't': ret = 'A'; break; default: ret = toupper (c); //fprintf(stderr,"\nLe sequenze di input contengono una qualche cazzata : %c\n", c); //exit(2); break; } return ret; } int overlap (char *a, char *b, int checkrev) { int l; int i, j, k; int len; int ans1 = 0; int ans2 = 0; int flag = 1; int toret = 0; int ansrev; char *p; len = strlen (a); flag = 1; for (l = 1; l < 3; l++) { for (i = 0; i < len - l; i++) { if (a[i + l] != b[i]) { flag = 0; ans1 = 0; break; } } if (flag == 1) toret = 1; flag = 1; for (i = 0; i < len - l; i++) { if (a[i] != b[i + l]) { flag = 0; ans2 = 0; break; } } if (flag == 1) toret = 1; } if (checkrev == 1) { p = revpattern (a); ansrev = overlap (p, b, 0); if (ansrev == 1) toret = 1; } return toret; } int hamming (char *a, char *b, int checkrev) { int h = 0, h1 = 9999; int i; int minl; char *p, *q; if (strlen (a) != strlen (b)) return 9999; if (strcmp (a, b) == 0) return 9999; for (i = 0; i < strlen (a); i++) if (a[i] != b[i]) h++; if (checkrev == 0) return h; if (checkrev == 1) { p = revpattern (a); h1 = 0; for (i = 0; i < strlen (a); i++) if (a[i] != b[i]) h1++; } if (h1 < h) h = h1; return h; } int inside (char *a, char *b, int checkrev) { int len1, len2; int i, j, k, match; int ans = 0, ansrev = 0; char *c; len1 = strlen (a); len2 = strlen (b); if (((len2 - len1) != 2)) return 0; for (i = 0; i < len2 - len1 + 1; i++) { match = 1; for (j = i; j < i + len1; j++) { if (a[j - i] != b[j]) { match = 0; break; } } if (match == 1) ans = 1; } if (checkrev == 1) { c = revpattern (a); ansrev = inside (c, b, 0); if (ansrev == 1) ans = 1; } return ans; } char * revpattern (char *a) { int len, i, j; char *toret; len = strlen (a); toret = (char *) malloc ((len + 1) * sizeof (char)); bzero (toret, len + 1); for (i = 0; i < len; i++) toret[i] = toupper (revcomp (a[len - i - 1])); return (toret); }