/*
 * Copyright (C) 2002 Jeffrey R. Hoy
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

// Headers required by this file (the original include list was lost).
#include <iostream>
#include <fstream>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace std;

// Parameters
typedef /*unsigned __int64*/ int DWORD64;   // change this to use 64-bit values

const double percentReads = 0.95;           // fraction of reads to cover
int cutoff = 50;                            // max sequence length
const int numFiles = 500;                   // number of files to build at once
#define BUFF_SIZE      1500000              // DWORD64s
#define HASH_SIZE      1048576              // integers
#define LISTBLOCK_SIZE 1048576              // integers
const int writeBuffSize = 20000;

#define MAX_ASTERIK 0
#define ASTERIK 99999999                    // special number to mark '*'

// Global variables
int minSupport;
int minSupportTotal;

int* globalBlock;                           // final stat information
int blockSize = 0;
int blockAlloc = 0;
int blockMax = 10000;                       // initial num lines in global stat block
const int lineLength = cutoff + 2;          // cutoff for sequence, one for count, one for length

int globalSequenceLength;                   // used for binary search

#define SYNTAX "Syntax:\n mine.exe datafile outfile minSupportIndividual minSupportTotal "

int checkAndMerge(int* line1, int* line2, int len);
void addBlockData(int length, int count, int* label);

class Sequence
{
private:
    int sequenceLength;
    int numSequences;
    DWORD64* data;
    char* fileName;

public:
    Sequence(char* fileNameIn);                                        // constructor for length 1
    Sequence(Sequence* seqLenN, Sequence* seqLen1, char* fileNameIn);  // constructor for length >1
    ~Sequence() { if (numSequences > 1 && data != NULL) delete [] data; }

    int getNumSequences()    { return numSequences; }
    int getSequenceLength()  { return sequenceLength; }
    const DWORD64* getData() { return data; }
    void addToBlock();
};

// Comparison for size 1 sorting, smaller first
int compare1(const void* a, const void* b)
{
    if (*(DWORD64*)a > *(DWORD64*)b)
        return 1;
    else if (*(DWORD64*)a == *(DWORD64*)b)
        return 0;
    else
        return -1;
}

// Comparison for global block sorting, larger first
int compare2(const void* a, const void* b)
{
    return ((int*)b)[cutoff] - ((int*)a)[cutoff];   // positive if b is larger, so larger counts sort first
}

// Comparison for finding most significant loads, larger first
int compare3(const void* a, const void* b)
{
    if (*(DWORD64*)a < *(DWORD64*)b)
        return 1;
    else if (*(DWORD64*)a == *(DWORD64*)b)
        return 0;
    else
        return -1;
}

// For binary search on sorted sequences
int search1(const void* a, const void* b)
{
    int i;
    for (i = 0; i < globalSequenceLength; i++) {
        if ( ((DWORD64*)a)[i] < ((DWORD64*)b)[i] )
            return -1;
        else if ( ((DWORD64*)a)[i] > ((DWORD64*)b)[i] )
            return 1;
    }
    return 0;
}
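// The length-1 constructor below uses a two-pass prefilter, roughly in the spirit of
// hash-based pruning with a counting filter: pass 1 only counts hash buckets
// (value % HASH_SIZE); pass 2 keeps exact counts just for values whose bucket total
// reached minSupport.  Every genuinely frequent value survives the filter, but hash
// collisions can let infrequent values through; those are discarded by the final
// ">= minSupport" packing step.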
// Builds all sequences of length 1
Sequence :: Sequence(char* fileNameIn)
{
    int index;
    int intsRead;
    int listIndex = 0;
    int numValues = 0;
    int check;

    sequenceLength = 1;
    fileName = fileNameIn;

    // probabilistic counting:
    //   hash everything into large buckets,
    //   mark buckets that have more than minimum support,
    //   hash again, make a list of addresses that might have min support and their counts,
    //   sort and pack

    DWORD64* hashBlock = new DWORD64 [HASH_SIZE];
    if (hashBlock == NULL) {
        cout << "Insufficient Memory for hashBlock";
        return;
    }
    memset(hashBlock, 0, HASH_SIZE*sizeof(DWORD64));

    ifstream* traceFile = new ifstream(fileName, ios::in | ios::binary);
    if (traceFile->fail()) {
        cout << "Unable to open trace file" << endl;
        return;
    }

    DWORD64* readBuff = new DWORD64 [BUFF_SIZE];
    if (readBuff == NULL) {
        cout << "Insufficient Memory for readBuff" << endl;
        return;
    }

    // first pass: count hash buckets
    while (!traceFile->eof()) {
        traceFile->read((char*)readBuff, BUFF_SIZE*sizeof(DWORD64));
        intsRead = traceFile->gcount() / sizeof(DWORD64);

        for (index = 0; index < intsRead; index++) {
            if (readBuff[index] % HASH_SIZE >= HASH_SIZE || readBuff[index] % HASH_SIZE < 0) {
                cout << "Problem with mod and hashing the data:" << endl;
                cout << *(int*)&readBuff[index] << " " << *(((int*)&readBuff[index])+1) << endl;
                cout << index << endl;
            }
            else
                hashBlock[readBuff[index] % HASH_SIZE]++;
        }
    }

    traceFile->close();
    traceFile->clear();                     // clear the eof state before reopening
    traceFile->open(fileName, ios::in | ios::binary);
    if (traceFile->fail()) {
        cout << "Unable to open trace file" << endl;
        return;
    }

    DWORD64* listBlock = new DWORD64 [LISTBLOCK_SIZE*2];
    if (listBlock == NULL) {
        cout << "Insufficient Memory for listBlock, change LISTBLOCK_SIZE" << endl;
        return;
    }
    memset(listBlock, 0, LISTBLOCK_SIZE*2*sizeof(DWORD64));

    // second pass: exact counts for values whose bucket passed the filter
    while (!traceFile->eof()) {
        traceFile->read((char*)readBuff, BUFF_SIZE*sizeof(DWORD64));
        intsRead = traceFile->gcount() / sizeof(DWORD64);

        for (index = 0; index < intsRead; index++) {
            if (hashBlock[readBuff[index] % HASH_SIZE] >= minSupport) {
                int found = 0;
                for (check = 0; check < listIndex; check++) {
                    if (listBlock[check*2] == readBuff[index]) {
                        found = 1;
                        listBlock[check*2+1]++;
                    }
                }
                if (!found) {
                    listBlock[listIndex*2] = readBuff[index];
                    listBlock[listIndex*2+1]++;
                    listIndex++;
                    if (listIndex == 3000) {
                        cout << "3000 sequences length 1 thus far, this might get slow" << endl;
                    }
                    if (listIndex >= LISTBLOCK_SIZE) {
                        cout << "Need more memory for listBlock... Program exit" << endl;
                        exit(1);
                    }
                }
            }
        }
    }
    traceFile->close();
    delete [] readBuff;

    // pack it up: first count how many values are supported
    numSequences = 0;
    for (index = 0; index < listIndex; index++)
        if (listBlock[index*2+1] >= minSupport)
            numSequences++;

    data = new DWORD64[numSequences*2*sizeof(DWORD64)];
    if (data == NULL) {
        cout << "Insufficient Memory for data" << endl;
        return;
    }
    memset(data, 0, numSequences*2*sizeof(DWORD64));

    // pack it up
    numSequences = 0;
    for (index = 0; index < listIndex; index++) {
        if (listBlock[index*2+1] >= minSupport) {
            data[numSequences*2]   = listBlock[index*2];
            data[numSequences*2+1] = listBlock[index*2+1];
            numSequences++;
        }
    }

#if MAX_ASTERIK > 0
    // Added code for wildcard
    data[numSequences*2] = ASTERIK;
    numSequences++;
#endif

    // sort in increasing order
    qsort(data, numSequences, sizeof(DWORD64)*2, compare1);

    cout << "Num potential sequences len1 is " << numSequences << endl;

    delete [] hashBlock;
    delete [] listBlock;
    delete traceFile;
}
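// The length n+1 constructor below is roughly the candidate-generation step used in
// Apriori-style sequence mining: two supported length-n sequences are joined when the
// last n-1 elements of one equal the first n-1 elements of the other, e.g. (x,y)
// joined with (y,z) yields the candidate (x,y,z).  Support is then counted by sliding
// a window of length n+1 over the trace and looking each window up with bsearch();
// candidates below minSupport are dropped when the results are packed.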
// Builds all sequences of length n+1
Sequence :: Sequence(Sequence* seqLenN, Sequence* seqLen1, char* fileNameIn)
{
    int i, j, k, h;
    int lenSeqN = seqLenN->getSequenceLength();
    int numSeqN = seqLenN->getNumSequences();
    int numSeq = 0;
    const DWORD64* seqLenNData = seqLenN->getData();

    sequenceLength = lenSeqN + 1;
    fileName = fileNameIn;

    // build larger sequences
    // just get the count this time
    numSeq = 0;
    for (i = 0; i < numSeqN; i++) {
        for (j = 0; j < numSeqN; j++) {
            if (memcmp(&seqLenNData[i*(lenSeqN+1)+1], &seqLenNData[j*(lenSeqN+1)],
                       (lenSeqN-1)*sizeof(DWORD64)) == 0) {
                // inner match found
                numSeq++;
            }
        }
    }

    DWORD64* tempData = new DWORD64[numSeq * (sequenceLength+1)];
    if (tempData == NULL) {
        cout << "Insufficient Memory2" << endl;
        exit(1);
    }
    memset(tempData, 0, sizeof(DWORD64)*numSeq*(sequenceLength+1));

    // now actually build
    numSeq = 0;
    for (i = 0; i < numSeqN; i++) {
        for (j = 0; j < numSeqN; j++) {
            if (memcmp(&seqLenNData[i*(lenSeqN+1)+1], &seqLenNData[j*(lenSeqN+1)],
                       (lenSeqN-1)*sizeof(DWORD64)) == 0) {
                // inner match found
                memcpy(&tempData[numSeq*(sequenceLength+1)], &seqLenNData[i*(lenSeqN+1)],
                       lenSeqN*sizeof(DWORD64));
                tempData[numSeq*(sequenceLength+1) + (sequenceLength-1)] =
                    seqLenNData[j*(lenSeqN+1)+(lenSeqN-1)];

                int astcount = 0;
                for (k = 0; k < sequenceLength; k++) {
                    if (tempData[numSeq*(sequenceLength+1)+k] == ASTERIK)
                        astcount++;
                }
                if (astcount <= MAX_ASTERIK)    // accept new sequence
                    numSeq++;
            }
        }
    }
    cout << "Num potential sequences length " << sequenceLength << " is " << numSeq << endl;

    // scan the trace and count support for each candidate
    ifstream* traceFile = new ifstream(fileName, ios::in | ios::binary);
    if (traceFile->fail()) {
        cout << "Unable to open trace file, program exit" << endl;
        return;
    }

    int intsRead = 0;
    DWORD64* buff = new DWORD64 [BUFF_SIZE];
    if (buff == NULL) {
        cout << "Insufficient Memory for readBuff" << endl;
        return;
    }

    while (!traceFile->eof()) {
        traceFile->read((char*)(&buff[intsRead]), (BUFF_SIZE-intsRead)*sizeof(DWORD64));
        intsRead += traceFile->gcount() / sizeof(DWORD64);

        // new optimized code to handle wildcard searches
        globalSequenceLength = sequenceLength;   // so search knows the number of elements
        DWORD64* place = NULL;
        DWORD64 temp1 = 0, temp2 = 0;

        for (h = 0; h < intsRead-sequenceLength+1; h++) {
            // using #if's because this is the hotspot
            // This could be rewritten to handle any number of asterisks, but it would be more complex code.
            // The search space is too large for more than 2 asterisks, so it shouldn't matter.
#if MAX_ASTERIK >= 3
            cout << "Only handling 2 asterisks for now" << endl;
#endif

#if MAX_ASTERIK >= 2
            for (i = 0; i < sequenceLength-1; i++) {
                temp1 = buff[h + i];
                buff[h + i] = ASTERIK;
                for (j = i+1; j < sequenceLength; j++) {
                    temp2 = buff[h + j];
                    buff[h + j] = ASTERIK;
                    place = (DWORD64*)bsearch(&buff[h], tempData, numSeq,
                                              sizeof(DWORD64)*(sequenceLength+1), search1);
                    if (place != NULL)          // found
                        place[sequenceLength]++;
                    buff[h + j] = temp2;
                }
                buff[h + i] = temp1;
            }
#endif

#if MAX_ASTERIK >= 1
            // change this so it only looks up if it misses the others?
            for (i = 0; i < sequenceLength; i++) {
                temp1 = buff[h + i];
                buff[h + i] = ASTERIK;
                place = (DWORD64*)bsearch(&buff[h], tempData, numSeq,
                                          sizeof(DWORD64)*(sequenceLength+1), search1);
                if (place != NULL)              // found
                    place[sequenceLength]++;
                buff[h + i] = temp1;
            }
#endif

            // also handle the original search
            place = (DWORD64*)bsearch(&buff[h], tempData, numSeq,
                                      sizeof(DWORD64)*(sequenceLength+1), search1);
            if (place != NULL)                  // found
                place[sequenceLength]++;
        }

        int intsToCopy = sequenceLength-1;      // need to overlap the buffer
        memcpy(&buff[0], &buff[intsRead-intsToCopy], (intsToCopy)*sizeof(DWORD64));
        intsRead = intsToCopy;
    }
    traceFile->close();

    // pack the data by removing unsupported sequences
    numSequences = 0;
    for (i = 0; i < numSeq; i++) {
        if (tempData[i*(sequenceLength+1) + sequenceLength] >= minSupport)
            numSequences++;
    }

    data = new DWORD64 [numSequences * (sequenceLength+1)];
    if (data == NULL) {
        cout << "Insufficient Memory3" << endl;
        return;
    }

    numSequences = 0;
    for (i = 0; i < numSeq; i++) {
        if (tempData[i*(sequenceLength+1) + sequenceLength] >= minSupport) {
            memcpy(&data[numSequences * (sequenceLength+1)], &tempData[i * (sequenceLength+1)],
                   (sequenceLength+1)*sizeof(DWORD64));
            numSequences++;
        }
    }

    delete [] buff;
    delete [] tempData;
    delete traceFile;
}
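// Before a sequence goes into the global block, addToBlock() below replaces its raw
// values with labels in order of first appearance, so e.g. the value sequence
// (17, 42, 17, 9) becomes the pattern 1,2,1,3, which printBlockData() later renders
// as "A B A C" (the concrete values here are illustrative).  Sequences over different
// values therefore share one block line, and one count, whenever they have the same
// repetition structure.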
void Sequence :: addToBlock()
{
    int i, j, k;
    int found;
    int label[200] = {0};
    int labelCount = 0;

    for (i = 0; i < numSequences; i++) {
        // get labels for block line
        labelCount = 0;
        for (j = 0; j < sequenceLength; j++) {
            found = 0;
            for (k = 0; k < j; k++) {
                if (data[i*(sequenceLength+1) + j] == data[i*(sequenceLength+1) + k]) {
                    // repeated element
                    found = 1;
                    label[j] = label[k];
                    break;
                }
            }
            if (!found) {
                if (data[i*(sequenceLength+1) + j] == ASTERIK) {
                    label[j] = ASTERIK;
                }
                else {
                    labelCount++;
                    label[j] = labelCount;
                }
            }
        }
        addBlockData(sequenceLength, (int)data[i*(sequenceLength+1) + sequenceLength], label);
    }
}

// global block update
// Global block structure:
//   blockMax lines each of length linelength
//   each line has 0 to cutoff labels, count, and length
//
void addBlockData(int length, int count, int* label)
{
    int i;

    // search for line
    // could actually hash this if it is a bottleneck
    for (i = 0; i < blockSize; i++) {
        if (memcmp(&globalBlock[i*lineLength], label, cutoff*sizeof(int)) == 0) {
            globalBlock[i*lineLength + cutoff] += count;
            return;
        }
    }

    // not found
    memcpy(&globalBlock[blockSize*lineLength], label, cutoff*sizeof(int));
    globalBlock[blockSize*lineLength + cutoff] += count;
    globalBlock[blockSize*lineLength + cutoff + 1] = length;
    blockSize++;

    // check if will need more memory for the block
    if (blockSize*lineLength >= blockAlloc) {
        cout << "Global info block full, reallocating..." << endl;
        blockAlloc = blockAlloc * 4;
        int* globalBlockNew = new int [blockAlloc];
        if (globalBlockNew == NULL) {
            cout << "Insufficient memory for global information block, program exit" << endl;
            exit(1);
        }
        cout << "Done reallocating1..." << endl;
        memset(globalBlockNew, 0, blockAlloc*sizeof(int));
        cout << "Done reallocating2..." << endl;
        memcpy(globalBlockNew, globalBlock, blockSize*lineLength*sizeof(int));
        cout << "Done reallocating3..." << endl;
        delete [] globalBlock;
        globalBlock = globalBlockNew;
        cout << "Done reallocating..." << endl;
    }
}
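// flatten() below renames labels in order of first appearance so a pattern always
// starts at 'A'; checkAndMerge() tries every rotation of one pattern and, if some
// rotation flattens to the other pattern, zeroes the smaller of the two counts.
// This is what folds rotation-equivalent forms such as AABA, ABAA, and ABBB into
// AAAB (the example from the comment in printBlockData), keeping only the most
// frequent form.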
// Normalize pattern by starting on 'A'
void flatten(int* a, int len)
{
    int i;
    int newcount = 1;
    int* change = new int[cutoff+1];        // labels run from 1 to len (len <= cutoff)
    for (i = 0; i <= cutoff; i++)
        change[i] = -1;

    for (i = 0; i < len; i++) {
        if (a[i] != ASTERIK)
            if (change[a[i]] == -1) {
                change[a[i]] = newcount;
                newcount++;
            }
    }

    for (i = 0; i < len; i++)
        if (a[i] != ASTERIK) {
            a[i] = change[a[i]];
        }

    delete [] change;
}

// See if a pattern is the same thing but shifted; if so, merge
int checkAndMerge(int* line1, int* line2, int len)
{
    int i, j;
    int found = 0;
    int* a = new int[len];
    int* b = new int[len];

    for (j = 0; j < len; j++)
        b[j] = line2[j];

    for (i = 0; i < len; i++) {
        for (j = 0; j < len; j++) {
            a[j] = line1[(i+j) % len];
        }
        flatten(a, len);

        found = 1;
        for (j = 0; j < len; j++)
            if (a[j] != b[j])
                found = 0;

        if (found) {
            // take max times appearing
            if (line1[cutoff] > line2[cutoff])
                line2[cutoff] = 0;
            else
                line1[cutoff] = 0;
            break;
        }
    }

    delete [] a;
    delete [] b;
    return found;
}

// Formats results to outfile
void printBlockData(char* outFileName, char* dataFileName)
{
    cout << "Global block size is " << blockSize << endl;

    int i, j, k;
    int templine[1000] = {0};

    // combine patterns that are really similar, like AAAB and AABA and ABAA and ABBB
    for (i = 1; i <= cutoff; i++) {
        for (j = 0; j < blockSize-1; j++) {
            if (globalBlock[j*lineLength + cutoff + 1] == i && globalBlock[j*lineLength + cutoff] > 0)
                for (k = j+1; k < blockSize; k++) {
                    // check that length is good
                    if (globalBlock[k*lineLength + cutoff + 1] == i && globalBlock[k*lineLength + cutoff] > 0)
                        checkAndMerge(&globalBlock[j*lineLength], &globalBlock[k*lineLength], i);
                }
        }
    }

    // sorts the whole thing, ignoring length at this point
    qsort(globalBlock, blockSize, lineLength*sizeof(int), compare2);

    ofstream outFile;
    outFile.open(outFileName, ios::out | ios::trunc);
    if (outFile.fail()) {
        cout << "Can't open output file: " << outFileName << endl;
        return;
    }

    outFile << "Data File Name: " << dataFileName << endl;
    outFile << "Minimum support for each load: " << minSupport << endl;
    outFile << "Minimum support over all loads: " << minSupportTotal << endl;

    for (i = 1; i <= cutoff; i++) {
        outFile << "Sequences Length " << i << endl;
        for (j = 0; j < blockSize; j++) {
            if (globalBlock[j*lineLength + cutoff + 1] == i &&
                globalBlock[j*lineLength + cutoff] >= minSupportTotal) {
                outFile << " Count: " << globalBlock[j*lineLength + cutoff] << " Sequence:";
                for (k = 0; k < i; k++) {
                    if (globalBlock[j*lineLength + k] == ASTERIK)
                        outFile << " " << '*';
                    else
                        outFile << " " << (char)((globalBlock[j*lineLength + k]-1) + 'A');
                }
                outFile << endl;
            }
        }
    }
    outFile.close();
}

// Analyzes a single load file, building sequences from 1 to cutoff
void Start(char* fileName)
{
    Sequence* seqLenN = new Sequence(fileName);
    seqLenN->addToBlock();

    while (seqLenN->getNumSequences() > 0 && seqLenN->getSequenceLength() < cutoff) {
        Sequence* seqLenNPlus1 = new Sequence(seqLenN, seqLenN, fileName);
        delete seqLenN;
        seqLenN = seqLenNPlus1;
        seqLenN->addToBlock();
    }
    delete seqLenN;
}
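// separateLoads() below reads the trace header (an int giving the number of loads),
// after which the file is a stream of (loadID, value) int pairs.  Loads are ranked by
// how many reads they account for, and only the most frequent ones covering
// percentReads (95%) of all reads are kept.  Each kept load's values are written to
// a loadNNNN.data file, numFiles files at a time, and each file is mined
// independently via Start().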
void separateLoads(char* dataFileName, char* outFileName)
{
    cout << "Execution Beginning" << endl;
    cout << "Max Sequence Length: " << cutoff << endl;

    int totalLoads = 0;
    int temp = 0;
    DWORD64 totalReads = 0;
    DWORD64 countReads = 0;
    int readsUsed = -1;
    int i;

    ifstream* traceFile = new ifstream(dataFileName, ios::in | ios::binary);
    if (traceFile->fail()) {
        cout << "Unable to open trace file, program exit" << endl;
        return;
    }

    //
    // Obtain numbers of most significant loads
    // Variable loadGroup has an int for each load.  It stores the new file number if the
    // load is significant.  Otherwise it stores 0.
    //
    traceFile->read((char*)&totalLoads, sizeof(int));
    cout << (int)totalLoads << " Total Loads" << endl;

    DWORD64* loadNums = new DWORD64[(int)totalLoads*2];
    if (loadNums == NULL)
        cout << "Not enough memory for loadNums" << endl;
    memset(loadNums, 0, sizeof(DWORD64)*(int)totalLoads*2);

    int* readBuff = new int [BUFF_SIZE*2];
    if (readBuff == NULL)
        cout << "Not enough memory for readBuff" << endl;

    while (!traceFile->eof()) {
        traceFile->read((char*)readBuff, BUFF_SIZE*2*sizeof(int));
        int valsRead = traceFile->gcount() / sizeof(int);
        for (i = 0; i < valsRead; i += 2)
            loadNums[readBuff[i]*2]++;
    }
    traceFile->close();

    for (i = 0; i < totalLoads; i++) {
        loadNums[i*2+1] = i;
    }

    // sort, larger in front
    qsort(loadNums, (int)totalLoads, sizeof(DWORD64)*2, compare3);

    for (i = 0; i < totalLoads; i++)
        totalReads += loadNums[i*2];

    for (i = 0; i < totalLoads; i++) {
        countReads += loadNums[i*2];
        if (((double)(int)countReads) / ((double)(int)totalReads) > percentReads) {
            readsUsed = i+1;
            break;
        }
    }
    cout << readsUsed << " Loads Used" << endl;

    if (readsUsed == -1) {
        cout << "Bad number of loads. Perhaps file does not exist? Program exit." << endl;
        exit(1);
    }

    DWORD64* loadGroup = new DWORD64 [(int)totalLoads];
    if (loadGroup == NULL)
        cout << "Not enough memory for loadGroup" << endl;
    memset(loadGroup, 0, sizeof(DWORD64)*(int)totalLoads);

    for (i = 0; i < readsUsed; i++)
        loadGroup[loadNums[i*2+1]] = i+1;   // assign group numbers to loads that will be used

    delete [] loadNums;

    //
    // Build a file for each of the significant loads
    // and analyze the values from that load
    //
    int* count = new int[numFiles];
    if (count == NULL)
        cout << "Not enough memory for count" << endl;
    memset(count, 0, sizeof(int)*numFiles);

    DWORD64* buff = new DWORD64[numFiles*writeBuffSize];
    if (buff == NULL) {
        cout << "Not enough memory for buff" << endl;
        return;
    }
    memset(buff, 0, sizeof(DWORD64)*numFiles*writeBuffSize);

    int base = 0;
    while (base < readsUsed) {
        // take it a group of files at a time
        memset(buff, 0, sizeof(DWORD64)*numFiles*writeBuffSize);
        memset(count, 0, sizeof(int)*numFiles);

        int max = base + numFiles;
        if (readsUsed < max)
            max = readsUsed;
        cout << "Building loads " << base << " to " << max-1 << endl;

        ofstream* tempFile = new ofstream[numFiles];
        char filename[100] = {0};
        for (i = 0; i < max-base; i++) {
            sprintf(filename, "load%04d.data", i);
            tempFile[i].open(filename, ios::out | ios::binary);
        }

        DWORD64 group;
        DWORD64 mgroup;

        traceFile->clear();                 // clear the eof state from the previous pass
        traceFile->open(dataFileName, ios::in | ios::binary);
        traceFile->read((char*)&temp, sizeof(int));

        while (!traceFile->eof()) {
            traceFile->read((char*)readBuff, BUFF_SIZE*2*sizeof(int));
            int valsRead = traceFile->gcount() / sizeof(int);
            for (i = 0; i < valsRead; i += 2) {
                if (loadGroup[readBuff[i]] > 0) {                  // used in a file
                    group = loadGroup[readBuff[i]]-1;              // get load number
                    if (base <= group && group < max) {            // if file is being built
                        mgroup = group-base;
                        buff[mgroup*writeBuffSize + count[mgroup]] = readBuff[i+1];
                        count[mgroup]++;
                        if (count[mgroup] == writeBuffSize) {
                            tempFile[mgroup].write((char*)&buff[mgroup*writeBuffSize],
                                                   sizeof(DWORD64)*count[mgroup]);
                            count[mgroup] = 0;
                        }
                    }
                }
            }
        }
        traceFile->close();

        for (i = 0; i < max-base; i++) {
            tempFile[i].write((char*)&buff[i*writeBuffSize], sizeof(DWORD64)*count[i]);
            tempFile[i].close();
        }

        for (i = 0; i < max-base; i++) {
            sprintf(filename, "load%04d.data", i);
            Start(filename);
            cout << "Analysis of Load " << i+base << " completed" << endl;
        }

        delete [] tempFile;
        base += numFiles;
    }

    delete [] loadGroup;
    delete [] readBuff;
    delete [] buff;
    delete [] count;
    delete traceFile;

    cout << "Printing block data to " << outFileName << " ..." << endl;
    printBlockData(outFileName, dataFileName);
    cout << "Execution Complete" << endl;
    return;
}
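// Example invocation (file names and thresholds are illustrative only):
//   mine.exe trace.bin patterns.txt 25 100
// argv[3] is the minimum support a pattern must have within a single load;
// argv[4] is the minimum combined count for a pattern to appear in the output file.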
int main(int argc, char* argv[])
{
    if (argc != 5) {
        cout << SYNTAX << endl;
        return 1;
    }

    char* dataFileName = argv[1];
    char* outFileName  = argv[2];
    minSupport      = atoi(argv[3]);
    minSupportTotal = atoi(argv[4]);

    if (minSupport > minSupportTotal) {
        cout << SYNTAX << endl;
        return 1;
    }

    blockAlloc = blockMax * lineLength;
    globalBlock = new int [blockAlloc];
    if (globalBlock == NULL) {
        cout << "Insufficient memory for global information block" << endl;
        return 1;
    }
    memset(globalBlock, 0, blockAlloc*sizeof(int));

    separateLoads(dataFileName, outFileName);

    delete [] globalBlock;
    return 0;
}