// Programmer: Craig Stuart Sapp // Note: Adapted from the Humdrum Toolkit program "simil": // simil.c 1.3 92/04/21 by Keith Orpen, University of // Waterloo, Ontario. // Creation Date: Tue Nov 17 14:35:26 PST 2009 // Last Modified: Tue Dec 8 20:06:25 PST 2009 // Filename: ...sig/examples/all/simil.cpp // Web Address: http://sig.sapp.org/examples/museinfo/humdrum/simil.cpp // Syntax: C++; museinfo // // Description: Measure Damerau-Levenshtein edit distance between a Humdrum // source file and template file. // #include "humdrum.h" #include "PerlRegularExpression.h" #include #include #include #ifndef OLDCPP #include #include #else #include #include #endif // function declarations: void checkOptions (Options& opts, int argc, char** argv); void example (void); void usage (const char* command); void readTemplateContents (Array >& storage, const char* filename); void readTemplateContents (Array >& storage, istream& filename); void printTemplate (Array >& storage); int chooseSpine (const char* interp, HumdrumFile& infile); void fillSourceData (Array >& sourcedata, Array& datalines, HumdrumFile& infile, int spine, int nulltest); void usual_thing (Array& results, Array >& sourcedata, Array >& templatedata); void unusual_thing (Array >& results, Array >& sourcedata, Array >& templatedata, int sublen); void printResults (Array& results, Array >& sourcedata, Array >& templatedata); void printResultsSubString (Array >& results, Array >& sourcedata, Array >& templatedata); double dlv (Array >& s1, int offset1, Array >& s2, int offset2); void doDamerauLevenshteinAnalysis(Array >& sourcedata, Array >& templatedata); double dlvcharstar (char** set1, int len1, char** set2, int len2); double dlvstring (Array >& set1, int offset1, Array >& set2, int offset2, int temlen); double dlvint (int *s1, int len1, int *s2, int len2); double dlv (Array >& set1, int offset1, Array >& set2, int offset2); int getMinIndex (Array& list); void printSubStringInfo (Array& list, double target); void printWeights (void); void readEditWeights (const char* filename); void printSequence (Array >& sourcedata, int index, int size, int flag); // User interface variables: Options options; int debugQ = 0; // used with --debug option int rawQ = 0; // used with -R option int xlen = 0; // used with -x option int reverseQ = 0; // used with -r option int scalingQ = 0; // used with -n option const char* interp = ""; // used with -i option int pweightQ = 0; // used with -W option int nullQ = 0; // used with -N option int appendQ = 0; // used with -a option (not implemented) int prependQ = 0; // used with -p option (not implemented) int sequenceQ = 0; // used with -s option int spacesQ = 1; // used with -S option double threshold = 0.0; // used with -t option double weight_R1 = 1.0; // --R1: deleting a repeated element of S1 double weight_R2 = 1.0; // --R2: deleting a repeated element of S2 double weight_D1 = 1.0; // --R1: deleting a non-repeated element of S1 double weight_D2 = 1.0; // --R1: deleting a non-repeated element of S2 double weight_S0 = 1.0; // --S0: sub by a non-repeated element double weight_S1 = 1.0; // --S1: sub by an element repeated in S1 double weight_S2 = 1.0; // --S2: sub by an element repeated in S2 double weight_S3 = 1.0; // --S3: sub by an element repeated in S1 and S2 ////////////////////////////////////////////////////////////////////////// int main(int argc, char** argv) { // process the command-line options checkOptions(options, argc, argv); HumdrumFile infile; Array > templatedata; Array > sourcedata; Array datalines; if ((options.getArgCount() < 1) || (options.getArgCount() > 2)) { usage(options.getCommand().data()); exit(1); } int sourcearg = 1; int templatearg = 2; if (reverseQ) { sourcearg = 2; templatearg = 1; } if (options.getArgCount() == 2) { infile.read(options.getArg(sourcearg)); readTemplateContents(templatedata, options.getArg(templatearg).data()); } else { // read second file from standard input if (sourcearg == 1) { infile.read(options.getArg(1)); readTemplateContents(templatedata, cin); } else { infile.read(cin); readTemplateContents(templatedata, options.getArg(1).data()); } } int spine = chooseSpine(interp, infile); fillSourceData(sourcedata, datalines, infile, spine, nullQ); if (debugQ) { cout << "SOURCE DATA: " << endl; printTemplate(sourcedata); cout << "TEMPLATE DATA: " << endl; printTemplate(templatedata); } doDamerauLevenshteinAnalysis(sourcedata, templatedata); if (pweightQ) { printWeights(); } return 0; } ////////////////////////////////////////////////////////////////////////// ////////////////////////////// // // fillSourceData -- extract tokens from source Humdrum file for use // in analysis. // void fillSourceData(Array >& sourcedata, Array& datalines, HumdrumFile& infile, int spine, int nulltest) { sourcedata.setSize(infile.getNumLines()); sourcedata.setSize(0); datalines.setSize(infile.getNumLines()); datalines.setSize(0); int strsize; int i, j; int index; for (i=0; i >& storage) { int i; for (i=0; i storage; char buffer[1024] = {0}; int strsize; weightfile.getline(buffer, 1024); while (!weightfile.eof()) { strsize = strlen(buffer); storage.setSize(strsize+1); strcpy(storage.getBase(), buffer); weightfile.getline(buffer, 1024); pre.sar(storage, "\\s*#.*", "", ""); if (pre.search(storage, "^\\s*(d1|d2|r1|r2|s0|s1|s2|s3)\\s+([\\d.+\\-]+)", "i")) { if (pre2.search(pre.getSubmatch(1), "D1", "i")) { weight_D1 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "D2", "i")) { weight_D2 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "R1", "i")) { weight_R1 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "R2", "i")) { weight_R2 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "S0", "i")) { weight_S0 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "S1", "i")) { weight_S1 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "S2", "i")) { weight_S2 = atof(pre.getSubmatch(2)); } else if (pre2.search(pre.getSubmatch(1), "S3", "i")) { weight_S3 = atof(pre.getSubmatch(2)); } } } weightfile.close(); } ////////////////////////////// // // readTemplateContents -- read the contents of the template file // void readTemplateContents(Array >& storage, const char* filename) { #ifndef OLDCPP ifstream templatefile(filename, ios::in); #else ifstream templatefile(filename, ios::in | ios::nocreate); #endif readTemplateContents(storage, templatefile); templatefile.close(); } void readTemplateContents(Array >& storage, istream& instream) { if (instream.eof()) { cerr << "Error opening template file" << endl; exit(1); } storage.setSize(1000); storage.setSize(0); storage.setGrowth(100000); char buffer[1024] = {0}; int index; int strsize; instream.getline(buffer, 1024); while (!instream.eof()) { strsize = strlen(buffer); storage.setSize(storage.getSize()+1); index = storage.getSize() - 1; storage[index].setSize(strsize+1); strcpy(storage[index].getBase(), buffer); instream.getline(buffer, 1024); } storage.allowGrowth(0); } ////////////////////////////// // // checkOptions -- // void checkOptions(Options& opts, int argc, char* argv[]) { options.define("n|no-normalize=b", "do not normalize similarity measures"); options.define("r|reverse=b", "reverse source and template arguments"); options.define("R|raw=b", "don't normalize the DLV values"); options.define("i|interp=s", "use first spine in excl. interp."); options.define("x|length=i:0", "subordinate pattern length"); options.define("W|print-weights=b", "print edit operation scores"); options.define("w|weight-file=s", "load editing weights from file"); options.define("N|exclude-null-tokens=b", "do not consider null tokens"); options.define("a|append=b", "append analysis to input data"); options.define("p|prepend=b", "prepend analysis to input data"); options.define("debug=b", "print debugging information"); options.define("s|sequence=b", "print search sequences"); options.define("S|no-spaces=b", "print search sequences without spaces"); options.define("t|threshold=d:0.0", "similarity threshold for output"); options.define("R1|r1=d:1.0", "scr for deleting a repeated element of S1"); options.define("R2|r2=d:1.0", "scr for deleting a repeated element of S2"); options.define("D1|d1=d:1.0", "scr for deleting non-repeated element of S1"); options.define("D2|d2=d:1.0", "scr for deleting non-repeated element of S2"); options.define("S0|s0=d:1.0", "scr for sub by a non-repeated element"); options.define("S1|s1=d:1.0", "scr for sub by element repeated in S1"); options.define("S2|s2=d:1.0", "scr for sub by element repeated in S2"); options.define("S3|s3=d:1.0", "scr for sub by element repeated in S1 & S2"); opts.define("author=b", "author of program"); opts.define("version=b", "compilation info"); opts.define("example=b", "example usages"); opts.define("help=b", "short description"); opts.process(argc, argv); // handle basic options: if (opts.getBoolean("author")) { cout << "Adapted by Craig Stuart Sapp 2009" << " from the simil program written by Keith Opern 1992" << endl; exit(0); } else if (opts.getBoolean("version")) { cout << argv[0] << ", version: 17 Nov 2009" << endl; cout << "compiled: " << __DATE__ << endl; cout << MUSEINFO_VERSION << endl; exit(0); } else if (opts.getBoolean("help")) { usage(opts.getCommand().data()); exit(0); } else if (opts.getBoolean("example")) { example(); exit(0); } xlen = options.getInteger("x"); reverseQ = options.getBoolean("r"); scalingQ = !options.getBoolean("n"); interp = NULL; if (options.getBoolean("i")) { interp = options.getString("i").data(); } debugQ = options.getBoolean("debug"); rawQ = options.getBoolean("raw"); pweightQ = options.getBoolean("print-weights"); nullQ = options.getBoolean("exclude-null-tokens"); appendQ = options.getBoolean("append"); prependQ = options.getBoolean("prepend"); sequenceQ = options.getBoolean("sequence"); spacesQ = !options.getBoolean("no-spaces"); threshold = options.getDouble("threshold"); if (options.getBoolean("weight-file")) { readEditWeights(options.getString("weight-file").data()); } if (options.getBoolean("R1")) { weight_R1 = options.getDouble("R1"); } if (options.getBoolean("R2")) { weight_R2 = options.getDouble("R2"); } if (options.getBoolean("D1")) { weight_D1 = options.getDouble("D1"); } if (options.getBoolean("D2")) { weight_D2 = options.getDouble("D2"); } if (options.getBoolean("S0")) { weight_S0 = options.getDouble("S0"); } if (options.getBoolean("S1")) { weight_S1 = options.getDouble("S1"); } if (options.getBoolean("S2")) { weight_S2 = options.getDouble("S2"); } if (options.getBoolean("S3")) { weight_S3 = options.getDouble("S3"); } } ////////////////////////////// // // example -- // void example(void) { } ////////////////////////////// // // usage -- // void usage(const char* command) { } /////////////////////////////////////////////////////////////////////////// ////////////////////////////// // // doDamerauLevenshteinAnalysis -- // void doDamerauLevenshteinAnalysis(Array >& sourcedata, Array >& templatedata) { if (xlen <= 0) { Array sresults; usual_thing(sresults, sourcedata, templatedata); printResults(sresults, sourcedata, templatedata); } else { Array > mresults; unusual_thing(mresults, sourcedata, templatedata, xlen); printResultsSubString(mresults, sourcedata, templatedata); } } double normalize1(double d, int len) { return exp(-d/len); } double normalize2(double d, int len) { return exp(-d); } ////////////////////////////// // // unusual_thing -- Do sub-string matching // void unusual_thing(Array >& results, Array >& sourcedata, Array >& templatedata, int sublen) { int i, j; int len = (int)fabs(sourcedata.getSize() - sublen + 1); int subcount = templatedata.getSize() - sublen + 1; results.setSize(0); if (len <= 0) { return; } results.setSize(len); for (i=0; i& list) { int i; int output = 0; if (list.getSize() <= 0) { return -1; } for (i=1; i& list, double target) { int i; int counter = 0; for (i=0; i 1) { cout << ","; } cout << i+1; } } } ////////////////////////////// // // printResultsSubString -- // void printResultsSubString(Array >& results, Array >& sourcedata, Array >& templatedata) { int i; double value; cout << "**simil\t**simxrf" << endl; int len = xlen; if (len <= 0) { len = templatedata.getSize(); } int maxi; for (i=0; i& results, Array >& sourcedata, Array >& templatedata) { int i; double value; cout << "**simil"; if (sequenceQ) { cout << "\t**seq"; } cout << endl; int len = templatedata.getSize(); for (i=0; i >& sourcedata, int index, int size, int flag) { int i; if (size <= 0) { cout << "."; return; } cout << sourcedata[index].getBase(); for (i=index+1; i& results, Array >& sourcedata, Array >& templatedata) { int len = (int)fabs(sourcedata.getSize() - templatedata.getSize() + 1); results.setSize(0); if (len <= 0) { return; } results.setSize(len); results.setAll(0.0); for (int i=0; i using dlvcharstar. // dlvstring = DLV raw value from Array data. // dlvcharstar = DLV raw value from char* data. // dlvint = DLV raw value from int data. (original function) // double dlv(Array >& set1, int offset1, Array >& set2, int offset2) { Array list1; Array list2; if (offset1 < 0) { offset1 = 0; } if (offset2 < 0) { offset2 = 0; } int size1 = set1.getSize() - offset1; int size2 = set2.getSize() - offset2; if ((size1 < 0) || (size2 < 0)) { cerr << "Error in offset values: " << size1 << ", " << size2 << endl; exit(1); } list1.setSize(size1); list2.setSize(size2); int i; for (i=0; i >& set1, int offset1, Array >& set2, int offset2, int temlen = -1) { int len1 = set1.getSize() - offset1; int len2 = set2.getSize() - offset2; if (temlen == -1) { // tem len is the length of the substring to consider temlen = set2.getSize(); } int len = len1; // the length of the sequences to compare if (len1 > len2) { len = len2; } if (temlen < len) { len = temlen; } double cost, val, m; int i, j; int mindex; int s1index, s2index; int n = 0; if (len+1 > n) { n = len + 1; } double min[n]; int rep1; int rep2; mindex = 0; s1index = offset1; m = 0.0; min[mindex] = 0.0; for (i=0; i0) && (strcmp(set1[s1index].getBase(), set1[s1index-1].getBase()) == 0)) { m += weight_R1; } else { m += weight_D1; } min[mindex] = m; } s2index = offset2; for (j=0; j0 && (strcmp(set2[s2index].getBase(), set2[s2index-1].getBase()) == 0)); mindex = 0; cost = min[mindex]; m = cost + (rep2 ? weight_R2 : weight_D2); min[mindex++] = m; s1index = offset1; for (i=0; i0) && (strcmp(set1[s1index].getBase(), set1[s1index-1].getBase()) == 0); m += rep1 ? weight_R1 : weight_D1; if (strcmp(set1[s1index].getBase(),set2[s2index].getBase()) == 0) { val = cost; } else { if (rep1) { val = cost + (rep2? weight_S3:weight_S1); } else { val = cost + (rep2? weight_S2:weight_S0); } } if (val < m) { m = val; } cost = min[mindex]; val = cost + (rep2 ? weight_R2 : weight_D2); if (val < m) { m = val; } min[mindex++] = m; } } return (double)m; } double dlvint(int *s1, int len1, int *s2, int len2) { float cost, val, m; int i, j; float *mptr; int *s1ptr, *s2ptr; int n = 0; float *min = NULL; if (len1+1 > n) { min = new float[len1+1]; n = len1 + 1; } mptr = min; s1ptr = s1; m = *mptr = 0.0; for (i=len1; --i>=0; s1ptr++) { m = *mptr++; if (s1ptr>s1 && *s1ptr==*(s1ptr-1)) { m += weight_R1; } else { m += weight_D1; } *mptr = m; } s2ptr = s2; for (j=len2; --j>=0; s2ptr++) { char rep2 = (s2ptr>s2 && *s2ptr==*(s2ptr-1)); mptr = min; m = (cost = *mptr) + (rep2? weight_R2:weight_D2); *mptr++ = m; s1ptr = s1; for (i=len1; --i>=0; s1ptr++) { char rep1 = (s1ptr>s1 && *s1ptr==*(s1ptr-1)); m += (rep1? weight_R1:weight_D1); if (*s1ptr==*s2ptr) val = cost; else { if (rep1) val = cost + (rep2? weight_S3:weight_S1); else val = cost + (rep2? weight_S2:weight_S0); } if (val < m) { m = val; } val = (cost = *mptr) + (rep2? weight_R2:weight_D2); if (val < m) m = val; *mptr++ = m; } } delete [] min; return (double)m; } double dlvcharstar(char** set1, int len1, char** set2, int len2) { int len = len1; // the length of the sequences to compare if (len1 > len2) { len = len2; } double cost, val, m; int i, j; int mindex; int s1index, s2index; int n = 0; if (len+1 > n) { n = len + 1; } double min[n]; int rep1; int rep2; mindex = 0; s1index = 0; m = 0.0; min[mindex] = 0.0; for (i=0; i0) && (strcmp(set1[s1index], set1[s1index-1]) == 0)) { m += weight_R1; } else { m += weight_D1; } min[mindex] = m; } s2index = 0; for (j=0; j0 && (strcmp(set2[s2index], set2[s2index-1]) == 0)); mindex = 0; cost = min[mindex]; m = cost + (rep2 ? weight_R2 : weight_D2); min[mindex++] = m; s1index = 0; for (i=0; i0) && (strcmp(set1[s1index], set1[s1index-1]) == 0); m += rep1 ? weight_R1 : weight_D1; if (strcmp(set1[s1index],set2[s2index]) == 0) { val = cost; } else { if (rep1) { val = cost + (rep2? weight_S3:weight_S1); } else { val = cost + (rep2? weight_S2:weight_S0); } } if (val < m) { m = val; } cost = min[mindex]; val = cost + (rep2 ? weight_R2 : weight_D2); if (val < m) { m = val; } min[mindex++] = m; } } return (double)m; } // md5sum: 84841e2d00679b77fc833fb9c6b336c9 simil.cpp [20170605]