1   /*
2    *    This program is free software; you can redistribute it and/or modify
3    *    it under the terms of the GNU General Public License as published by
4    *    the Free Software Foundation; either version 2 of the License, or
5    *    (at your option) any later version.
6    *
7    *    This program is distributed in the hope that it will be useful,
8    *    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10   *    GNU General Public License for more details.
11   *
12   *    You should have received a copy of the GNU General Public License
13   *    along with this program; if not, write to the Free Software
14   *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15   */
16  
17  
18  package jr239.co620;
19  
20  import org.apache.log4j.Logger;
21  
22  
23  import weka.classifiers.Evaluation;
24  import weka.core.Instances;
25  import weka.core.Instance;
26  import weka.classifiers.CostMatrix;
27  import weka.classifiers.Classifier;
28  import weka.classifiers.bayes.NaiveBayes;
29  import weka.core.converters.ArffLoader ;
30  import weka.core.FastVector;
31  
32  import weka.core.Attribute;
33  
34  import weka.gui.LogPanel;
35  import weka.core.converters.ConverterUtils.*;
36  import weka.estimators.EstimatorUtils;
37  
38  
39  import java.util.Random; 
40  import java.util.ArrayList;
41  import java.util.Arrays;
42  import java.io.File;
43  import java.io.IOException;
44  
45  import java.util.HashMap; 
46  import java.util.HashSet;
47  import java.util.Properties;
48  
49  import java.io.FileWriter;
50  
51  import java.io.FileReader;
52  
53  
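/**
 * Prepares Weka data sets for the attribute-joining ant search: loads the initial ARFF data set,
 * builds "joined attribute" variants of it and evaluates them with a NaiveBayes classifier.
 * All state is held in static fields, so the class behaves as a static utility despite the
 * singleton accessor below.
 *
 * A minimal call-sequence sketch, mirroring the internal test setJoinedAttributesValuesTest
 * (the ordering is an observation of this file, not a documented API contract):
 * <pre>
 *   setInitialDataSetFromArrfFile("data/t0.arff");
 *   setInitialAttributeStringValues();
 *   computeJoinedAttrbsWorkingDataSet(new int[]{1, 2});   // join attribute columns 1 and 2
 *   Evaluation joined = getClassifierEnginePerformanceOnJoinedAttributesDataset();
 *   Evaluation baseline = getBaslineEvaluation();
 * </pre>
 */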
54  public class SimpleBayesDataProcessor {
55      
56      
57       protected static SimpleBayesDataProcessor getInstance(LogPanel lw){
58           synchronized(SimpleBayesDataProcessor.class) {
59               
60               if (singleInstance == null){
61                   singleInstance = new SimpleBayesDataProcessor();
62              }
63               wl = lw; // set unconditionally: the eager field initializer below makes the branch above dead code, which previously left wl unset
64         }
65           
66           return singleInstance;
67      }
68      //
69      
70      
71      
72      /** =====================================   private side  ============================================== *
73       * @return the initial data set provided
74       */
75      
76      protected static final Instances getInitialDataSet(){
77          return initialDataSet0;
78      }
79      //
80      
81      //private static int[] AttrbValuesCounts = null;
82      
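    /**
     * Checks whether every value of the joined attribute (column jap) occurs in at least
     * minCountPer100 percent of the rows of the joined data set.
     * @return a two element array: [0] Boolean, true when all value counts reach the threshold;
     *         [1] Double, the normalized minimum count computed by setNormalizedMinCountValue().
     */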
83      protected static final Object[] cehckJAminCount(final int minCountPer100){ // also computes the normalized min count via setNormalizedMinCountValue()
84          boolean minCount = true;
85          double r = ((double)minCountPer100)/(100.00 );
86          //long minC = Math.round(r * ((double)joinedAttributesDataSet.numInstances())); 
87          double minC = r * ((double)joinedAttributesDataSet.numInstances()); 
88          Object[] check = new Object[2];
89          int[] AttrbValuesCounts = null;
90          double normalizedMinCount = -9.99; 
91        
92        AttrbValuesCounts = joinedAttributesDataSet.attributeStats(jap).nominalCounts;// the joined attribute column is hardcoded to the first index (0) and the class to the last; easier to handle
93        normalizedMinCount = setNormalizedMinCountValue( AttrbValuesCounts);
94        
95         for( int valueCountIndex = 0 ;  valueCountIndex < AttrbValuesCounts.length; valueCountIndex ++  ){
96             if( AttrbValuesCounts[valueCountIndex] < minC ){
97                 minCount = false;
98                 break;
99             }
100         }
101       check[0] = minCount;
102       check[1] = normalizedMinCount; 
103        return check;
104     }
105     
106     protected static int getMinCount(){
107         return minValsCount;
108     }
109    
110     
111     private static double  setNormalizedMinCountValue(int[] AttrbValuesCount ){ 
112          double normalizedMinCounts= -9.99;
113         HashSet<Integer> uniqueCountsValues = new HashSet<Integer>();
114         for( int i = 0; i < AttrbValuesCount.length; i++ ){
115             uniqueCountsValues.add(AttrbValuesCount[i]);
116         }
117         Integer[] cl = uniqueCountsValues.toArray(new Integer[uniqueCountsValues.size()]);
118         Arrays.sort(cl);// min count first element
119         normalizedMinCounts = (cl[0]  )   /   ((double) ( joinedAttributesDataSet.numInstances() ) ); //+ cl[cl.length-1]?? (minCount + maxCount) /( 2 * numberRows)
120         //normalizedMinCounts = cl[0]  /  ((double)joinedAttributesDataSet.numInstances()); to avoid all the attrbs reduced to 0 count weight because the min count is zero
121        return normalizedMinCounts;
122     }
123   
124     
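    /**
     * Builds the joined nominal attribute for a and b: its name is a.name + JOIN + b.name and
     * its values are the Cartesian product of the two value sets, each pair joined with JOIN.
     */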
125     private static Attribute 
126     getJoinedAttribute( Attribute a, Attribute b){
127         
128         int anv = a.numValues(), bnv = b.numValues();
129         FastVector  jointAttrbNmeVals = new FastVector(anv*bnv);
130        
131         for ( int  av= 0; av < anv ; av++){
132           for( int  bv = 0; bv < bnv; bv++ ){
133               jointAttrbNmeVals.addElement(a.value( av) + JOIN  + b.value(bv) );
134           }// inner loop over b values
135        }// outer loop over a values
136      
137     return new Attribute ( (a.name()  +  JOIN  +  b.name()) , 
138                                           jointAttrbNmeVals
139                                         );
140     }// end of method
141 
142     //
143   
144     private static 
145     Attribute  getJoinedAtrrbDef (){
146         
147         Attribute a = initialDataSetAttributesDefinition [ attrsToJoinIndxs[ 0 ] ];
148         Attribute b = initialDataSetAttributesDefinition [ attrsToJoinIndxs[ 1] ];
149         joinedAttrb = getJoinedAttribute(a,b);
150         
151         for( int i = 2; i < ajn ; i++){
152             
153                  a = joinedAttrb ;
154                  b = initialDataSetAttributesDefinition [ attrsToJoinIndxs[ i ] ];
155                  joinedAttrb = getJoinedAttribute(a,b);
156         }
157        return joinedAttrb; 
158     }// end of method
159       
160     private static void 
161         setIndxsAttrbsToJoin (int [] ix ){
162              ajn = ix.length; 
163             attrsToJoinIndxs = new int[ajn];
164             for(int i = 0; i < ajn ; i++){
165                 attrsToJoinIndxs[i] = ix[i];
166             }
167             
168         }
169     
170     
171     private static boolean joinedAttribute(int AttrIndx){
172         
173         for(int i =0; i < ajn; i++ ){
174             if (AttrIndx ==attrsToJoinIndxs[i]) {
175 		return true;
176 	    }
177         }
178         return false;
179     }
180     ///
181     
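    /**
     * Builds joinedAttributesDefinition: the joined attribute is placed at index jap (column 0)
     * and every attribute that is not part of the join keeps its original definition, so the
     * joined data set has jna = ina - ajn + 1 columns.
     */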
182     private static  void  
183     setJoinedAttributesDefinition (int[] joinIndxs ){
184         
185         setIndxsAttrbsToJoin(joinIndxs);
186         
187         jna = ina - ajn  +1; // number of attributes, "columns", in the joined attributes data set; one less than in the initial dataset if 2 attributes are joined.
188         joinedAttributesDefinition = new FastVector(jna);
189         joinedAttributesDefinition.insertElementAt(( getJoinedAtrrbDef()), jap);// joinedAttribute placed at "jap", default index  0  "first column "
190         
191         for( int  i = 1; i < jna; i++){
192            for(int j = 0; j < ina; j++){
193                if( !joinedAttribute(j)){
194                    joinedAttributesDefinition.insertElementAt(initialDataSetAttributesDefinition[j], i);
195                    i++;
196                }//if
197            }//internal loop
198        }//external loop
199     }//end of method
200     //
201     
202     private static Instance
203     getJoinedAttribIndexedValues(String[] joinedValAsString){
204         
205         double[] indexedVals = new double[jna]; 
206         
207         for (int i = 0; i < jna; i++){
208             
209             //double[] instance = new double[m_Data.numAttributes()];
210             int index = joinedAttributesDataSet.attribute(i).indexOfValue(joinedValAsString[i]);
211             //index = m_Data.attribute(i).indexOfValue(m_Tokenizer.sval);
212             assert index > -1:"nominal value not declared in header" ;
213             // if (index == -1) {
214             // errorMessage("nominal value not declared in header");
215             //}
216             indexedVals[i] = (double)index;
217             //instance[i] = (double)index;
218             
219         }
220         Instance iv = new Instance( 1, indexedVals );
221         iv.setDataset(joinedAttributesDataSet);
222         return iv;
223     }
224     //  
225     /////
226       private static String
227     getJoinedValues(int indx){
228         String jv =initialAttributesStringValsSet[indx][attrsToJoinIndxs[0]];
229         for ( int i = 1; i < ajn; i++){
230             jv =jv + JOIN +initialAttributesStringValsSet[indx][attrsToJoinIndxs[i]];
231         }
232         return jv;
233     }
234     //
235    
236      static int getFolds(){
237 	return foldingsNum;
238      }
239   
240     private static  void 
241     setJoinedAttributesStringValues( ){
242         
243         joinedStringValsSet = new String[rowsNumber][jna];
244         
245          for(int i = 0; i < rowsNumber; i++){   // all the rows;" tuples of attributes"
246              
247              String[] theJoinedAttributeValues = new String[jna]; 
248              theJoinedAttributeValues[jap] = getJoinedValues(i);
249              
250               for(int j = 1; j < jna; j ++){// all non-joined attributes are kept as they are in the initial data set.
251                   for(int k = 0; k < ina; k++){// all the columns; each attribute recorded value.
252                      if( ( !joinedAttribute(k)) ){ // skip the joined attributes values ; they are the new joined value placed  as the first attribute value in the joined dataset."first column"
253                          theJoinedAttributeValues[j] = initialAttributesStringValsSet [i][k];// adding the not joined attributes values
254                         j++;//next value in the joined  attributes dataset values.
255                     }//if
256                     ///  else  j--; 
257                  }//k loop
258              }// j loop
259              joinedStringValsSet[i] = theJoinedAttributeValues;
260          }// i loop
261        // return   joinedStringValsSet ;
262     }// end of method
263     //
264  
265    private static void
266    setInitialAttributeStringValues() { 
267        initialAttributesStringValsSet = new String [rowsNumber][ina];
268          for(int row= 0;row< rowsNumber;  row ++){                                               // all the rows;" tuples of attributes"
269              String[] theInitialAttributeStringValues = new String[ina]; 
270              Instance initialAttrbIndexedVals = initialDataSet.instance(row);
271 	     
272               for(int column = 0; column < ina; column ++){                                    // all columns, "attributes", per row
273 		  int  AttrbuteStringValueIndex = (int )initialAttrbIndexedVals.value(column);
274 		   theInitialAttributeStringValues[column] = AttributeIndexToStringValueMapsList.get(column).get(AttrbuteStringValueIndex);    
275 		   //we are getting the string value of this column. Col has index of the value on value list.
276 		     
277              }// column loop
278              initialAttributesStringValsSet[row] = theInitialAttributeStringValues;
279          }// row loop
280     }
281 //
282    
283    
284    private  static void
285     setInitialAttributesDefinition (){
286         HashMap<Integer, String> attrbuteIndexToStringValueMap =null;
287 	int attrbValuesNumber =0;
288 	AttributeIndexToStringValueMapsList = new ArrayList< HashMap<Integer, String> >(ina);
289         initialDataSetAttributesDefinition= new  Attribute[ina] ;
290 	
291         for(int attributeIndex = 0; attributeIndex <ina; attributeIndex ++){
292             initialDataSetAttributesDefinition[attributeIndex] = initialDataSet.attribute(attributeIndex);
293 	    attrbValuesNumber = initialDataSetAttributesDefinition[attributeIndex].numValues();
294 	    attrbuteIndexToStringValueMap = new    HashMap<Integer, String> (attrbValuesNumber);
295 	    
296 	    for(int attrbVal  = 0; attrbVal< attrbValuesNumber; attrbVal++ ){
297 		attrbuteIndexToStringValueMap.put( attrbVal , initialDataSetAttributesDefinition[attributeIndex].value(attrbVal) );
298 	    }//each attrb values
299 	    
300 	    AttributeIndexToStringValueMapsList.add(attributeIndex, attrbuteIndexToStringValueMap);
301           }//each  attribute
302         initialAttrbsValuesCounters = getAttrbValuesCounts(initialDataSet);
303     }
304     //
305     
306     //private static  ArrayList <ArrayList <Integer>> attrbsValuesCounters =  new  ArrayList <ArrayList <Integer>> (); 
307     
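    /**
     * Counts, for every attribute of the given data set, how often each of its nominal values
     * occurs; the result is one list of counters per attribute, indexed by value index.
     */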
308     private static ArrayList <ArrayList <Integer>>  getAttrbValuesCounts (Instances dataSet) {
309         
310         ArrayList <ArrayList <Integer>> attrbsValuesCounters = getInitAttrbsValsCounters(dataSet);
311         int rows = dataSet.numInstances(),
312 		columns = dataSet.numAttributes(); 
313             for(int tupleNumb  = 0; tupleNumb < rows; tupleNumb++){ // for each row: one tuple of attribute values
314                 
315                    for(int attrbNum = 0; attrbNum <columns ; attrbNum++){//cols or attrbs; for each attrb
316                        
317                        int attrbNumVals=dataSet.attribute(attrbNum).numValues() ;
318                         for(int attrValNum = 0;attrValNum < attrbNumVals ;attrValNum++) { // for each value( of each at)
319                             
320                             if( 
321                                     dataSet.instance(tupleNumb).stringValue(attrbNum).equalsIgnoreCase(dataSet.attribute(attrbNum).value(attrValNum)) 
322                                ){ //increase the counter for the attribute value found
323                                 Integer  attrValCount =  attrbsValuesCounters.get(attrbNum).get(attrValNum);
324                                 attrValCount  =  attrValCount  +  1;
325                                 attrbsValuesCounters.get(attrbNum).set(attrValNum, attrValCount);
326                                 }//if
327                             
328                          }// for each att val count
329                        
330                    }//for each attrb  
331                    
332            }//for each tuple
333 	 
334          return attrbsValuesCounters;
335     }// end of method
336     //
337     
338     protected static boolean  checkJoinedAttributeValsMinCount(  ){
339 	
340 	ArrayList <ArrayList <Integer>>  ac  =   getAttrbValuesCounts (joinedAttributesDataSet);
341 	ArrayList <Integer> a = ac.get(jap);
342 	for (Integer val : a){
343 	    if ( val < minValsCount){
344 		lg.info("The count for value: "  +  joinedAttrb.value(a.indexOf(val) )   + " is: \"" + val + "\" \n" );
345 		return false;
346 	    }
347 	}
348 	 return true;
349     }
350     
351     private static void  setAttrbValsCounts( ArrayList <Integer>attrbValsCounts, int attrbValsNumber){
352         Integer Z= 0;
353 	int counts = attrbValsNumber; // list is empty before adding any element.. even if its  "capacity" has been set. Then using "size" will return 0; 
354         for (int i =0; i < counts ; i++ ){
355            attrbValsCounts.add(Z);
356         }
357     }
358     //
359     
360   
361     private static ArrayList <ArrayList <Integer>>   getInitAttrbsValsCounters(Instances dataSet){    // no need for an expanding list: all the sizes are known from the dataset
362 	int  attrbsNum = dataSet.numAttributes();
363         ArrayList <ArrayList <Integer>> counters =  new  ArrayList <ArrayList <Integer>> (attrbsNum);  // List  of list Of counters
364 	int attrbValsNumber = 0;                  // one list per attribute, with one counter per value of that attribute
365         for(int z = 0; z < attrbsNum; z++){   // e.g. attributes A B C D X
366 	    attrbValsNumber = dataSet.attribute(z).numValues() ;  // if A = {a1, a2}, the first list has two counters; counter 0 counts the "a1" values in the data set
367 	  ArrayList <Integer> AttrValsCounts = new ArrayList <Integer>( attrbValsNumber);
368 	  setAttrbValsCounts(AttrValsCounts , attrbValsNumber);												   							
369            counters.add(AttrValsCounts);			 
370         }
371         return counters;
372     }
373 
374     
375     
376     
377     
378     private static void setClassIndex(){
379         classIndex = ina -1; // last attribute "column"
380     }
381     //
382     
383     
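    /**
     * Loads the initial data set from the given ARFF file (falling back to data/t0.arff when the
     * name is empty), sets the class index to the last column, derives the number of
     * cross-validation folds, reads runNo/writtenFileNo/minValsCount from search.properties,
     * and computes the baseline evaluation and the majority-class counts.
     */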
384     private static  void
385     setInitialDataSetFromArrfFile(String arrfFileNme){
386         String fileName="data/t0.arff"; 
387         if(!arrfFileNme.equalsIgnoreCase("")) {
388             fileName = arrfFileNme;
389         }
390         File fil =  new File(fileName);
391         ArffLoader L = new  ArffLoader();
392         try {
393             L.setSource(fil);
394             initialDataSet = new Instances(L.getDataSet()) ;
395 	    initialDataSet0 = new Instances( initialDataSet );
396             ina = initialDataSet.numAttributes();
397             setClassIndex();
398             initialDataSet.setClassIndex(classIndex);
399 	    initialDataSet0.setClassIndex(classIndex);
400             rowsNumber = initialDataSet.numInstances();
401 	     SetXvalidtnFoldsNum();
402            }catch ( IOException  e) { 
403                       
404                System.err.println("Cannot read arff file: " + fileName + " (" + e.getMessage() + ")\n" );
405                System.exit(1);
406            }
407      
408           searchProperties=new Properties();
409         try{
410             FileReader rd = new FileReader(propertiesFilename);searchProperties.load(rd);
411           runID=Integer.parseInt((String)searchProperties.get("runNo"));
412           writtenFileNo =Integer.parseInt((String)searchProperties.get("writtenFileNo"));
413           minValsCount = Integer.parseInt((String)searchProperties.get("minValsCount"));
414            rd.close();
415         }catch(Exception e){
416             System.err.println(e.getMessage());
417         }
418       
419         SLBD = SLBD_back + getJAstring(initialDataSet0.relationName()) + runID + "/";
420         runID++;
421         
422     
423         initialDataSet0ClassifierEvaluation = getDataSetClassifierEvaluation(initialDataSet0); // 
424         setInitialAttributesDefinition ();
425         setMajorityClassCounts();
426         //setInitialAttributeStringValues(); set on the go
427     }
428    //
429   
430     
431 
432    private static void setMajorityClassCounts(){
433         maxClassValDataSet0 = getMajorityClassValueDataSet(initialDataSet0);
434         initialDataSet0MaxClassValattrValsCounts = setAttributesValuesCounts(maxClassValDataSet0);
435 
436         Integer [] kl = initialDataSet0MaxClassValattrValsCounts.keySet().toArray(new Integer [0]);
437         initialDataSet0MaxClassValAttrValsSortedList = new  Double[kl.length];
438         for(int i =0; i < kl.length; i++){
439             initialDataSet0MaxClassValAttrValsSortedList[i]= (double)kl[i];
440         }
441        
442        
443         Arrays.sort(initialDataSet0MaxClassValAttrValsSortedList);
444         initialDataSet0MaxClassValAttrValsSortedListNormalized = Arrays.copyOf( initialDataSet0MaxClassValAttrValsSortedList, initialDataSet0MaxClassValAttrValsSortedList.length);
445         for(int c = 0 ; c < initialDataSet0MaxClassValAttrValsSortedListNormalized.length; c++){
446             initialDataSet0MaxClassValAttrValsSortedListNormalized[c] = (
447                     initialDataSet0MaxClassValAttrValsSortedListNormalized[c]
448                             /
449                     (1.000 *  maxClassValDataSet0.numInstances() )
450                     );
451         }
452     }
453     //
454     
455     protected static final Double[][] getBaseLineMaxClassCounts(){
456         Double[][]  valsCounts ={ 
457       
458                 initialDataSet0MaxClassValAttrValsSortedList,
459       
460                 initialDataSet0MaxClassValAttrValsSortedListNormalized
461         };
462         return valsCounts;
463     }
464     
465     private static  void
466     setJoinedAttrbIndexedVals(){
467         for(int i = 0; i < rowsNumber; i++){
468             joinedAttributesDataSet.add(getJoinedAttribIndexedValues(joinedStringValsSet[i]));
469         }
470     }
471     //
472     
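    /** Fold-count heuristic: 10 folds when there are more than 49 rows, 2 folds when there are
     *  fewer than 10, otherwise 1 + rowsNumber / 10 (integer division). */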
473     protected static void SetXvalidtnFoldsNum(){
474         if( rowsNumber > 49) {
475 	    foldingsNum = 10;
476 	}
477         else if (rowsNumber < 10) {
478 	    foldingsNum = 2;
479 	}
480         else {
481 	    foldingsNum =( 1 + (rowsNumber / 10) );
482 	}
483     }
484     //
485     
486     private static Evaluation  getDataSetClassifierEvaluation(Instances dataSet){
487         Evaluation e = null;
488         try{
489              e = new Evaluation( dataSet,  new CostMatrix(dataSet.attribute(dataSet.classIndex()).numValues()));
490              e.crossValidateModel( 
491                      classifierSB , dataSet, foldingsNum, randomIntGenerator  
492                      );
493         }catch( Exception ex){
494             System.err.println("Something went wrong while cross-validating the data set: " + ex);
495 	    System.exit(AT);
496         }
497         return e;
498     }
499     //
500     
501     private static void testCase( int[] tuples, String[][] expectedVals){
502         for ( int i =0 ; i < tuples.length; i++){
503             for(int j = 0 ; j < jna; j ++){
504                 assert joinedStringValsSet[tuples[i]][ j].endsWith(expectedVals[i][j]);
505             }
506         }
507     }
508     //
509     
510     protected static   void  setJoinedAttributesValuesTest(String arrFileName) {
511         /**
512             TestT0 
513 join 2 attrib columns, 1 & 2 (indexes starting at 0)
514 joined attrib placed in col 0
515 col 1,2
516 row 0  "b1-c1,a1,d2,x2"
517 row 7	 "b2-c1,a1,d2,x2"
518 row 13 "b2-c1,a3,d1,x2"
519 
520 3 joined
521 col 1,2,3
522 row 0  "b1-c1-d2,a1,x2"
523 row 7	 "b2-c1-d2,a1,x2"
524 row 13 "b2-c1-d1,a3,x2"
525 
526 
527 
528 ===== 14 rows ==
529 @data
530 a1,b1,c1,d2,x2	0
531 a1,b1,c1,d1,x2	1
532 a2,b1,c1,d2,x1	2
533 a3,b2,c1,d2,x1	3
534 a3,b3,c2,d2,x1	4
535 a3,b3,c2,d1,x2	5
536 a2,b3,c2,d1,x1	6
537 a1,b2,c1,d2,x2	7
538 a1,b3,c2,d2,x1	8
539 a3,b2,c2,d2,x1	9
540 a1,b2,c2,d1,x1	10
541 a2,b2,c1,d1,x1	11
542 a2,b1,c2,d2,x1	12
543 a3,b2,c1,d1,x2	13
544 
545 ===
546         
547 **/
548      // ----------------------------------------   
549         int[]  indexes1 = {1,2};
550         //setIndxsAttrbsToJoin(indexes);
551         int[] Tuple ={ 0, 7, 13}; // rows to test
552         String[][] expected2 ={
553                        {"b1-c1","a1","d2","x2"} ,  //r0 =
554                        { "b2-c1","a1","d2","x2"} , // //row 7	
555                         {"b2-c1","a3","d1","x2"}   // r13 
556         };
557          setInitialDataSetFromArrfFile( arrFileName);
558 	 setInitialAttributeStringValues();
559          computeJoinedAttrbsWorkingDataSet (indexes1);
560 	 
561         
562  
563           
564        assert joinedStringValsSet.length == rowsNumber: joinedStringValsSet.length ;
565        assert joinedStringValsSet[3].length == jna;
566        testCase(Tuple, expected2);
567        System.out.println(" \n joined2 \n" + joinedAttributesDataSet.toString() );
568    
569       
570         int[]  index2 = {1,2,3};
571         setIndxsAttrbsToJoin(index2);
572         String[][] expected3={
573             {"b1-c1-d2","a1","x2" }, //row 0  
574           { "b2-c1-d2","a1","x2"},  //row 7
575           { "b2-c1-d1","a3","x2"}  // row 13 "    
576         };
577         
578         computeJoinedAttrbsWorkingDataSet (index2);
579     
580         assert joinedStringValsSet.length == rowsNumber: joinedStringValsSet.length ;
581         assert joinedStringValsSet[3].length == jna;
582         testCase(Tuple, expected3);
583         System.out.println(" \n joined3 \n" + joinedAttributesDataSet.toString() );
584        
585     }
586     //
587     
588     private static String getJoinedAttrbsDataSetName(){
589         StringBuilder n = new  StringBuilder( attrsToJoinIndxs.length + "JA[" );
590         for (int i =0 ; i < attrsToJoinIndxs.length-1 ; i++){
591             n.append(attrsToJoinIndxs[i]+"-");
592         }
593         n.append( (attrsToJoinIndxs[attrsToJoinIndxs.length-1] )+ "] Atrb Name=[" );
594         for (int i =0 ; i < attrsToJoinIndxs.length-1 ; i++){
595             n.append(initialDataSet.attribute(attrsToJoinIndxs[i]).name()+ "-");
596         }
597         n.append(initialDataSet.attribute(attrsToJoinIndxs[attrsToJoinIndxs.length-1]).name()+ "] From:");
598         n.append(initialDataSet.relationName()+"\n");
599         
600         
601         return n.toString();
602     }
603     //
604    private static int getJoinedAttrbDataSetClassIndex(){
605        return jna -1;//last attribute or collumn
606    }
607    //
608    
609     private static void
610     setJoinedAttrbsTestingDataSet (int[] joinIndxs){
611         
612         setJoinedAttributesDefinition ( joinIndxs);
613         joinedAttributesDataSet = new Instances( 
614                 getJoinedAttrbsDataSetName(),  
615                 joinedAttributesDefinition, 1 
616                 );
617         joinedAttributesDataSet.setClassIndex(getJoinedAttrbDataSetClassIndex());
618         setJoinedAttributesStringValues ();
619         setJoinedAttrbIndexedVals();
620         //joinedAttributesDataSetClassifierEvaluation = getDataSetClassifierEvaluation(joinedAttributesDataSet);
621     } 
622    
623     private static int antSearchNo=0;
624     private static String actxt="";
625     protected static void setAntContext(String c){
626         //SimpleBayesDataProcessor.setAntContext("Fd"+foldingsNum+"S"+searchNo);
627         actxt = c;
628     }
629      
630     protected static void setAntid(int antNum){
631         antSearchNo = antNum;
632     }
633   
634     protected static final String getSBS(){
635         return SLBD;
636     }
637     
638     private static boolean badChar(char c){
639         boolean bc = false; // bad char for a file name
640         if (!Character.isDigit(c) &&  !Character.isLetter(c)){
641             bc = true;
642         }
643         return bc;
644     }
645  
646     private static String getJAstring(String jaNme){
647         String n= jaNme.trim();
648         char[] bch = new char[6];
649         int bc= 0;
650         if(n.length() >6 ){
651             n =n.substring( 0, 6);
652         } 
653         for(int x = 0 ; x< n.length(); x++ ) {
654             if (badChar(n.charAt(x))) {
655                 bch[bc] = n.charAt(x);
656                 bc++;
657             }
658         }
659         for(int r = 0 ; r<bc; r++){
660             n = n.replace(bch[r], 'M');
661         }
662         
663         
664          return n; 
665     }
666     
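    /**
     * Builds the joined-attributes working data set for the given attribute indexes, saves it as
     * an ARFF file under the SLBD/AS/ant directory for inspection, and cross-validates NaiveBayes
     * on it, storing the result in joinedAttributesDataSetClassifierEvaluation.
     */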
667     protected static void
668     computeJoinedAttrbsWorkingDataSet (int[] joinIndxs){
669         String d = SLBD+"AS/", e="-FID-"+writtenFileNo+"No.arff"; 
670         writtenFileNo++;//update the FID so  saved files are not overwritten in case we need to look at them (arff files, data files..)
671         setJoinedAttributesDefinition ( joinIndxs);
672         joinedAttributesDataSet = new Instances( 
673                 getJoinedAttrbsDataSetName(),  
674                 joinedAttributesDefinition, 1 
675                 );
676         joinedAttributesDataSet.setClassIndex(getJoinedAttrbDataSetClassIndex());
677         setJoinedAttributesStringValues ();
678         
679 	setJoinedAttrbIndexedVals();
680         
681         d=d+"ant"+antSearchNo+"/JA"+actxt+
682                 getJAstring(joinedAttributesDataSet.attribute(jap).name())
683                 +e;
684         
685 	saveDataSet(d, joinedAttributesDataSet);
686         
687 	    joinedAttributesDataSetClassifierEvaluation = getDataSetClassifierEvaluation(joinedAttributesDataSet);
688 	    //lg.debug("\nJoined Attributes Datset created is:\n"+  joinedAttributesDataSet.toString() 
689 		//    + "\n --End of Joined Attributes Datset created details--\n"
690 		  //  );
691 	    if(rowsNumber < 1){ // data set has no instances: log its summary to the GUI panel
692 		lw("\nJoined Attributes Datset created is:\n"+  joinedAttributesDataSet.toSummaryString() 
693 		    + "\n --End of Joined Attributes Datset created details--\n"
694 		    );
695 	    }
696 	antSearchNo++;    
697 	//}
698     } 
699     //
700     
701     private static volatile SimpleBayesDataProcessor singleInstance = new SimpleBayesDataProcessor();
702     private SimpleBayesDataProcessor(){
703     
704     }
705     //
706     //private static Instances initialDatatSetFoldedBack = null;
707     protected static void initColonyEvaluationFolds(){ //this need to be done when colony provides the solution. This is the joined attrbs.
708 	initialDatatSetFolded = new Instances(initialDataSet);
709 	initialDatatSetFolded.randomize(randomIntGenerator);
710 	initialDatatSetFolded.stratify(foldingsNum);
711 	//initialDatatSetFoldedBack = new Instances(initialDatatSetFolded);
712     }
713     //
714     
715     //driven from the loop through the number of folds by coordinatorAnt
716     //called before the colony starts a new search
717     private static Instances initialTrainingDataSet =null;
718     protected static void setACOworkingSetFold(int workingFoldNumber){
719 	initialTrainingDataSet =  new Instances (
720 		initialDatatSetFolded.trainCV( foldingsNum , workingFoldNumber, randomIntGenerator )
721 		);
722 	initialDataSet = initialTrainingDataSet ;
723 	rowsNumber = initialDataSet.numInstances(); //move the reference of initial set to the working set for the ants to find  just the  rows  they should use
724 	setInitialAttributeStringValues(); // the rows selected as working set. 
725     }							     //The "headers of the data table" are just the names (and domains) of  the attributes . This is" constant"
726 							   //  and is set in setInitialAttributesDefinition() when the data source is read.
727 	
728 	// called after the ant colony has found the best colony solution
729     /**
730      * @return the joined-attributes data set built for the best colony solution
731      */
735     private static  Instances colonyJoinedAttributesDataSet =null; //colony solution
736     protected static  Instances getACOjoinedAttribsTrainingDataSet(){
737         
738          return colonyJoinedAttributesDataSet  ;
739          
740     }
741     /**
742      * Rebuilds the joined-attributes test, training and full data sets for the given fold and evaluates the colony solution with NaiveBayes.
743      * @param workingFoldNumber the cross-validation fold used as the test set
744      * @param AC0joinedAttributes indexes of the attributes joined by the colony solution
745      * @return the EvaluationACO for this fold
746      */
747     protected static final EvaluationACO setAC0TestingSetFoldGetColonySearchEvaluation( int workingFoldNumber, int[] AC0joinedAttributes){
748 	NaiveBayes bCAco =  new NaiveBayes();
749 	String  dir="", dirb =SLBD+"COS/", e = "+-FID-"+ writtenFileNo+"No.arff";//directory to record data at colony solution level , file extension
750         writtenFileNo++;
751 	//once the colony gets a solution--> joined attrbs three datasets need to be recreated as joined from the initial data set
752 	// to have weka code to evaluate the model
753 	// join attributes in the test fold dataset for testing
754 	initialDataSet  = initialDatatSetFolded.testCV(foldingsNum, workingFoldNumber); // move to the testing fold and build the joinedAttributes data set for evaluation
755 	rowsNumber = initialDataSet.numInstances(); 
756 	 setInitialAttributeStringValues(); 
757 	 setJoinedAttrbsTestingDataSet(AC0joinedAttributes);
758 	  Instances joinedAttributesTestingDataSet= new Instances(joinedAttributesDataSet);
759           
760           dir=dirb+"JA"+
761                  getJAstring( joinedAttributesTestingDataSet.attribute(0).name() )+
762                   "F"+workingFoldNumber+"/";
763           
764 	 saveDataSet(dir+"testDsFd"+workingFoldNumber+e, joinedAttributesTestingDataSet );
765 	 //training set for the model evaluation at colony level... with the whole dataset
766 	 initialDataSet = initialTrainingDataSet;
767 	 rowsNumber = initialDataSet.numInstances(); 
768 	 setInitialAttributeStringValues(); 
769 	 setJoinedAttrbsTestingDataSet(AC0joinedAttributes);
770 	Instances joinedAttribsTrainingDataSet   = new Instances(joinedAttributesDataSet);
771         saveDataSet(dir+"traDsFd"+workingFoldNumber+e, joinedAttribsTrainingDataSet );
772 	 
773 	 initialDataSet =  new Instances ( initialDataSet0 ); //restore initial dataset
774 	 rowsNumber = initialDataSet.numInstances(); 
775 	 setInitialAttributeStringValues(); 
776 	 setJoinedAttrbsTestingDataSet(AC0joinedAttributes);
777          colonyJoinedAttributesDataSet = new Instances(joinedAttributesDataSet);
778           saveDataSet(dir+"jaIDSfullFd"+workingFoldNumber+e, colonyJoinedAttributesDataSet );
779          
780          //this dataset is used to initialise the EvaluationACO
781 	//every colony solution can be a different joined attribute and therefore a different dataset.
782 	 //The Evaluation has to be set up with the total dataset; for each colony solution the dataset is created by joining the attributes.
783 	 //In plain weka naive Bayes the Evaluation object is created outside and drives the cross validation;
784 	 //this case is a cross validation of potentially 10 different solutions from the colony.
785 	 //Data is split: the working part goes to the colony, where ants get the best classification from joining attributes;
786 	 //the solution is a joined attribute, so the dataset has to be rebuilt from the initial data set
787 	 //and the evaluation created on that. To cope with the different sets, different evaluations are needed, which
788 	 //then have to be averaged outside.
789 	try{  
790 	colonyEvaluation = new EvaluationACO(   joinedAttributesDataSet , new CostMatrix(joinedAttributesDataSet.numClasses()) 
791                                                                             ); // EvaluationACO sees the whole set on the joined attributes given by the ant colony
792 	colonyEvaluation.validateACOmodel(bCAco, joinedAttribsTrainingDataSet , joinedAttributesTestingDataSet);
793 	}  catch( Exception ex){
794             System.err.println("Something went wrong while evaluating the colony solution: " + ex);
795 	}
796         
797          //joinedAttribsTrainingDataSet = null;
798          //joinedAttributesTestingDataSet=null;
799 	 return colonyEvaluation; //keep a record of every fold evaluation; every evaluation carries its own validation because every fold
800     }			    //can be on a different dataset, since the join of attributes changes the structure of the dataset.
801 	
802     
803     protected static  int getInitalDataSetAttribsNumber(){
804         return ina; 
805     }
806     //
807     
808     protected static Evaluation  getBaslineEvaluation(){
809         
810          return initialDataSet0ClassifierEvaluation; 
811     }
812     //
813     
814     protected static Evaluation getClassifierEnginePerformanceOnJoinedAttributesDataset (){
815         return joinedAttributesDataSetClassifierEvaluation;
816     }
817     
818     protected static Classifier getClassifierEngine(){
819         
820         return classifierSB;
821     }
822     
823     protected void getAttValsCountTest(String dataFle){
824         setInitialDataSetFromArrfFile(dataFle);
825         initialAttrbsValuesCounters.get(4).get(0);
826         initialAttrbsValuesCounters.get(3).get(1);
827     }
828     
829     protected static void processDataFromSource(String dataSetFile){
830         
831                setInitialDataSetFromArrfFile(dataSetFile);
832     }
833     
834     private static void lw (String output){
835         if (wl != null) { wl.logMessage(output); } // guard: the LogPanel is only supplied via getInstance(LogPanel)
836     }
837     
838    
839     private static void saveDataSet(final String file ,final Instances ds){
840         arffSaver.setInstances(ds);
841         try{
842             //s = new DataSink(file); 
843             arffSaver.setFile(new File( file));
844             arffSaver.writeBatch();
845         }catch(java.lang.Exception e){
846             StringBuilder m = new StringBuilder("System cannot save dataset:\n" + ds.toSummaryString()+"\n to file:\n "+ file);
847             lg.warn(m); 
848             System.err.println(m);
849         }
850     }
851     //
852 
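    /**
     * Returns the subset of instances whose class value is the most frequent one in the data set;
     * assumes the class attribute is the last column and exits otherwise.
     */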
853     private static Instances getMajorityClassValueDataSet(Instances dataSet){
854          classIndex = dataSet.classIndex();//assumed to be the last element
855         if(classIndex !=dataSet.numAttributes() -1  ) {
856             System.err.println("The class is hardcoded into the last column at getMajorityClassValueDataSet(Instances dataSet) \n");
857             System.exit(AT);
858         }
859         int [] classValsCounts= dataSet.attributeStats(classIndex).nominalCounts;
860         
861         HashMap<Integer,Integer> CountToValindex = new HashMap<Integer,Integer>(classValsCounts.length);
862         for ( int c = 0; c< classValsCounts.length  ; c++){ //class index hard coded into the last column assumption
863             CountToValindex.put(classValsCounts[c], c);
864         }
865         Arrays.sort(classValsCounts);
866         Integer maxClassVal =(CountToValindex.get( classValsCounts[classValsCounts.length -1]));
867    
868       
869         Instances maxClassValDataSet =EstimatorUtils.getInstancesFromClass(dataSet, classIndex, maxClassVal);
870         return maxClassValDataSet;
871     }
872      //
873     
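    /**
     * Despite the name, returns a map from value count to the attributes and value indexes that
     * have that count (count -> attribute index -> set of value indexes), for all non-class columns.
     */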
874     private static  HashMap<Integer, HashMap<Integer, HashSet<Integer>>>
875     setAttributesValuesCounts(Instances dataSet){
876         HashMap<Integer, HashMap<Integer, HashSet<Integer>>>  counts = new HashMap<Integer, HashMap<Integer, HashSet<Integer>>>();
877         for(int row = 0; row < dataSet.numInstances(); row ++){
878              for ( int col = 0; col < dataSet.numAttributes()-1; col ++){
879                 int[] attrbValsCounts = dataSet.attributeStats(col).nominalCounts;
880                 
881                 for( int valindex= 0; valindex < attrbValsCounts.length; valindex ++ ){ //class index hardcoded into the last  column
882                     
883                      if ( !counts.containsKey(attrbValsCounts[valindex])){ // the count for the attribute value is the key; from the count, access the attribute and then the attribute value index
884                          
885                          HashSet<Integer> attrbVals = new  HashSet<Integer>(); 
886                          attrbVals.add(valindex);
887                          
888                          HashMap<Integer, HashSet<Integer>> attrbToVal = new HashMap<Integer, HashSet<Integer>>();
889                          attrbToVal.put(col, attrbVals);
890                          counts.put( attrbValsCounts[valindex] , attrbToVal  );
891                      }else{
892                          if(!counts.get(attrbValsCounts[valindex]).containsKey(col)) { // this count does not yet map this column (attribute) to any value indexes
893                                 HashMap<Integer, HashSet<Integer>> aTv = counts.get(attrbValsCounts[valindex]);
894                                 HashSet<Integer> aVals=new HashSet<Integer>();
895                                 aVals.add(valindex);
896                                 aTv.put(col, aVals);
897                                 counts.put( attrbValsCounts[valindex] , aTv );
898                          }else{
899                              HashMap<Integer, HashSet<Integer>> aTv = counts.get(attrbValsCounts[valindex]);
900                              HashSet<Integer> aVals=aTv.get(col);
901                              aVals.add(valindex);
902                             aTv.put(col, aVals);
903                             counts.put( attrbValsCounts[valindex] , aTv );
904                          }
905                      }//if-else
906                 }//valIndex   
907              }//columns
908         }//rows
909        
910         return  counts;
911     }//method
912     //  
913       
914     protected static final HashMap<Integer, HashMap<Integer, HashSet<Integer>>>  getbaseLineCountToAttMap(){
915         
916          return initialDataSet0MaxClassValattrValsCounts;
917     }
918     //
919     protected static void finalSerchUpdateSearchPropertiesFile(){
920            searchProperties.put("runNo", Integer.toString(runID));
921            searchProperties.put("writtenFileNo", Integer.toString(writtenFileNo)); // was stored under "runNo" by mistake, which overwrote the run counter
922          try{
923              boolean append =true;
924              FileWriter w = new FileWriter(propertiesFilename,append);
925              searchProperties. store(w, "updates");
926              w.close();
927          }catch(IOException e){
928              System.err.println(e.getMessage());
929          }
930         
931     }
932     
933     
934     
935     //
936 
937    
938      /*   ---------------------------------------- -------------- -------------- fields ----------------------------------------------- */
939     
940     protected  static final Logger lg= Logger.getLogger(SimpleBayesDataProcessor.class);
941        
942     private static final String JOIN = "-" ; // character used to join attribute names and values
943     
944     //private static String arrFileName = null;
945     
946     private static Instances initialDataSet = null;
947     private static  Attribute[] initialDataSetAttributesDefinition=null;
948     private static String [][] initialAttributesStringValsSet = null;
949     //private static Instance[]  initialAttributesValsSet =null;//
950     
951     private static Instances initialDatatSetFolded = null;
952 					    // initialTrainigWorkingSetFold = null;
953      private static   Instances initialDataSet0 = null;
954     private static EvaluationACO colonyEvaluation= null;
955     
956     
957     private static Instances joinedAttributesDataSet = null;
958     private static FastVector joinedAttributesDefinition = null;
959     private static String[][] joinedStringValsSet = null;
960     private static Attribute joinedAttrb = null;
961     private static  Evaluation joinedAttributesDataSetClassifierEvaluation = null;
962     
963     private static int  jna = 0; //JoinedAttributesDataSet.numAttributes();
964     private static int ina  = 0;  //initial number of attributes 
965     private static int rowsNumber = 0;// initialAttributesValsSet cardinality. The joined attributes dataset has the same number of rows but one column less: the same recorded data, with the joined attributes (columns) now being one
966     private static int ajn = 0; // number of attributes to be joined    
967     private static int jap = 0 ; // position of the new joined attribute; easier to place it in the first column of the data table (dataset)
968     private static int classIndex =  -1; // "Column" that contains the class attribute in the data set table. Index start at 0.
969     
970     private static  int[] attrsToJoinIndxs = null; //new int[ajn];
971     
972     private static   Evaluation initialDataSet0ClassifierEvaluation = null;
973     
974     private static   NaiveBayes  classifierSB = new NaiveBayes();
975     private static final Random randomIntGenerator =  new Random();
976     
977     protected static int foldingsNum = 0;
978     private static LogPanel wl =null;
979     private static  int  AT= 1; //ABNORMAL TERMINATION
980     
981     private static  ArrayList <ArrayList <Integer>> initialAttrbsValuesCounters = null;
982   
983     private static ArrayList< HashMap<Integer, String> > AttributeIndexToStringValueMapsList =null;
984     
985     private static HashMap<Integer, HashMap<Integer, HashSet<Integer>>> initialDataSet0MaxClassValattrValsCounts = null;
986     
987     private static Double [] initialDataSet0MaxClassValAttrValsSortedList =null;
988     
989    private static Instances maxClassValDataSet0 = null;
990    private static Double [] initialDataSet0MaxClassValAttrValsSortedListNormalized = null; 
991   
992   // private static Utils ut = new Utils();
993    private static String SLBD ="S/",  SLBD_back = "S/"; //base dir to record ja computations
994    //private static String projectResources =  Main.projectResources;//"resources/";
995    private static String system = Main.system;
996    private static String propertiesFilename =  system  +"search.properties";
997    private static   Properties searchProperties = null;
998       
999    private static int minValsCount =0;
1000    private static int writtenFileNo=-10; // sentinel so we know when the search properties file was not read
1001    private static int runID = -99;
1002    
1003    private static weka.core.converters.ArffSaver arffSaver =  new weka.core.converters.ArffSaver();
1004 }//End of class
1005 ///////////////