View Javadoc
1   /*
2    * Copyright 2008-2009 The Kuali Foundation
3    * 
4    * Licensed under the Educational Community License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    * http://www.opensource.org/licenses/ecl2.php
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.kuali.ole.gl.batch;
17  
18  import java.io.BufferedReader;
19  import java.io.BufferedWriter;
20  import java.io.File;
21  import java.io.FileNotFoundException;
22  import java.io.FileReader;
23  import java.io.FileWriter;
24  import java.io.IOException;
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.Comparator;
28  import java.util.UUID;
29  
30  import org.apache.commons.io.FileUtils;
31  import org.kuali.ole.sys.OLEConstants;
32  import org.kuali.ole.sys.context.SpringContext;
33  import org.kuali.rice.core.api.config.property.ConfigurationService;
34  
35  /**
36   * This class...
37   */
38  public class BatchSortUtil {
39      private static org.apache.log4j.Logger LOG = org.apache.log4j.Logger.getLogger(BatchSortUtil.class);
40      
41      private static File tempDir;
42  
43      private static File getTempDirectory() {
44          if ( tempDir == null ) {
45              tempDir = new File( SpringContext.getBean(ConfigurationService.class).getPropertyValueAsString(OLEConstants.TEMP_DIRECTORY_KEY) );
46          }
47          return tempDir;
48          }
49  
50      static public void sortTextFileWithFields(String inputFileName, String outputFileName, @SuppressWarnings("rawtypes") Comparator comparator){
51          // create a directory for the interim files
52          String tempSortDirName = UUID.randomUUID().toString();
53          File tempSortDir = new File( getTempDirectory(), tempSortDirName );
54          // ensure the directory is empty
55          FileUtils.deleteQuietly(tempSortDir);
56          try {
57              FileUtils.forceMkdir(tempSortDir);
58          } catch (IOException ex) {
59              LOG.fatal( "Unable to create temporary sort directory", ex );
60              throw new RuntimeException( "Unable to create temporary sort directory", ex );
61          }
62  
63          int numFiles = sortToTempFiles( inputFileName, tempSortDir, comparator );
64  
65          // now that the sort is complete - merge the sorted files
66          mergeFiles(tempSortDir, numFiles, outputFileName, comparator);
67  
68          // remove the temporary sort directory
69          FileUtils.deleteQuietly(tempSortDir);
70      }
71  
72      static int linesPerFile = 10000;
73  
74      /* Code below derived from code originally written by Sammy Larbi and
75       * downloaded from www.codeodor.com.
76       *
77       * http://www.codeodor.com/index.cfm/2007/5/14/Re-Sorting-really-BIG-files---the-Java-source-code/1208
78       */
79      private static int sortToTempFiles(String inputFileName, File tempSortDir, Comparator<String> comparator) {
80          BufferedReader inputFile;
81           try {
82               inputFile = new BufferedReader(new FileReader(inputFileName));
83           } catch ( FileNotFoundException ex ) {
84               LOG.fatal( "Unable to find input file: " + inputFileName, ex );
85               throw new RuntimeException( "Unable to find input file: " + inputFileName, ex );
86           }
87           try {
88               String line = "";
89               ArrayList<String> batchLines = new ArrayList<String>( linesPerFile );
90  
91               int numFiles = 0;
92               while ( line !=null ) {
93                   // get 10k rows
94                   for ( int i = 0; i < linesPerFile; i++ ) {
95                       line = inputFile.readLine();
96                       if ( line != null ) {
97                           batchLines.add(line);
98                       }
99                   }
100                  // sort the rows
101 //                 batchLines = mergeSort(batchLines, comparator);
102                  Collections.sort(batchLines, comparator);
103         
104                  // write to disk
105                  BufferedWriter bw = new BufferedWriter(new FileWriter( new File( tempSortDir,  "chunk_" + numFiles ) ));
106                  for( int i = 0; i < batchLines.size(); i++) {
107                      bw.append(batchLines.get(i)).append('\n');
108                  }
109                  bw.close();
110                  numFiles++;
111                  batchLines.clear(); // empty the array for the next pass
112              }
113              inputFile.close();
114              return numFiles;
115          } catch (Exception ex) {
116              LOG.fatal( "Exception processing sort to temp files.", ex );
117              throw new RuntimeException( ex );
118          }
119     }
120         
121     private static void mergeFiles(File tempSortDir, int numFiles, String outputFileName, Comparator<String> comparator ) {
122         try {
123             ArrayList<FileReader> mergefr = new ArrayList<FileReader>( numFiles );
124             ArrayList<BufferedReader> mergefbr = new ArrayList<BufferedReader>( numFiles );
125             // temp buffer for writing - contains the minimum record from each file
126             ArrayList<String> fileRows = new ArrayList<String>( numFiles );
127 
128             BufferedWriter bw = new BufferedWriter(new FileWriter(outputFileName));
129 
130             boolean someFileStillHasRows = false;
131 
132             // Iterate over all the files, getting the first line in each file
133             for ( int i = 0; i < numFiles; i++) {
134                 // open a file reader for each file
135                 mergefr.add(new FileReader(new File( tempSortDir, "chunk_"+i) ) );
136                 mergefbr.add(new BufferedReader(mergefr.get(i)));
137 
138                 // get the first row
139                 String line = mergefbr.get(i).readLine();
140                 if (line != null) {
141                     fileRows.add(line);
142                     someFileStillHasRows = true;
143                 } else  {
144                     fileRows.add(null);
145                 }
146             }
147             
148             while (someFileStillHasRows) {
149                 String min = null;
150                 int minIndex = 0; // index of the file with the minimum record
151 
152                 // init for later compare - assume the first file has the minimum
153                 String line = fileRows.get(0);
154                 if (line!=null) {
155                     min = line;
156                     minIndex = 0;
157                 } else {
158                     min = null;
159                     minIndex = -1;
160         }
161 
162                 // determine the minimum record of the top lines of each file
163                 // check which one is min
164                 for( int i = 1; i < fileRows.size(); i++ ) {
165                     line = fileRows.get(i);
166                     if ( line != null ) {
167                         if ( min != null ) {
168                             if( comparator.compare(line, min) < 0 ) {
169                                 minIndex = i;
170                                 min = line;
171                             }
172                         } else {
173                             min = line;
174                             minIndex = i;
175                         }
176                     }
177                 }
178 
179                 if (minIndex < 0) {
180                     someFileStillHasRows=false;
181                 } else {
182                     // write to the sorted file
183                     bw.append(fileRows.get(minIndex)).append('\n');
184 
185                     // get another row from the file that had the min
186                     line = mergefbr.get(minIndex).readLine();
187                     if (line != null) {
188                         fileRows.set(minIndex,line);
189                     } else { // file is out of rows, set to null so it is ignored
190                         fileRows.set(minIndex,null);
191                     }
192                 }
193                 // check if one still has rows
194                 for( int i = 0; i < fileRows.size(); i++) {
195                     someFileStillHasRows = false;
196                     if(fileRows.get(i)!=null)  {
197                         if (minIndex < 0) {
198                             throw new RuntimeException( "minIndex < 0 and row found in chunk file " + i + " : " + fileRows.get(i) );
199                         }
200                         someFileStillHasRows = true;
201                         break;
202                     }
203         }
204              
205                 // check the actual files one more time
206                 if (!someFileStillHasRows) {
207                     //write the last one not covered above
208                     for(int i=0; i<fileRows.size(); i++) {
209                         if (fileRows.get(i) == null) {
210                             line = mergefbr.get(i).readLine();
211                             if (line!=null) {
212                                 someFileStillHasRows=true;
213                                 fileRows.set(i,line);
214                             }
215                         }
216                     }
217                 }
218     }
219     
220             // close all the files
221             bw.close();
222             for(BufferedReader br : mergefbr ) {
223                 br.close();
224             }
225             for(FileReader fr : mergefr ) {
226                 fr.close();
227             }
228         } catch (Exception ex) {
229             LOG.error( "Exception merging the sorted files", ex );
230             throw new RuntimeException( "Exception merging the sorted files", ex );
231         }
232    }
233     
234 }