View Javadoc
1   package org.kuali.ole.utility.callnumber;
2   
3   /**
4    * Created with IntelliJ IDEA.
5    * User: ?
6    * Date: 19/2/13
7    * Time: 7:49 PM
8    * To change this template use File | Settings | File Templates.
9    */
10  
11  import com.ibm.icu.lang.UCharacter;
12  import org.apache.commons.lang.StringUtils;
13  import org.marc4j.ErrorHandler;
14  
15  import java.text.DecimalFormat;
16  import java.util.HashMap;
17  import java.util.Map;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  
21  //import com.solrmarc.icu.lang.UCharacter;
22  
23  /**
24   * Call number utility functions for solrmarc
25   *
26   * @author Naomi Dushay, Stanford University
27   */
28  
29  public final class CallNumUtils {
30  
31  
32  // TODO:  should have LCcallnum and DeweyCallnum classes, with the call number
33  //   pieces as fields.  Then parsing would happen once per call number, not
34  //   all over the place and some parsing repeated.
35  
36      /**
37       * Default Constructor: private, so it can't be instantiated by other objects
38       */
39      private CallNumUtils() {
40      }
41  
42      public static final Pattern DEWEY_PATTERN = Pattern.compile("^\\d{1,3}(\\.\\d+)?.*");
43      /**
44       * regular expression string for the required portion of the LC classification
45       * LC classification is
46       * 1-3 capital letters followed by  float number (may be an integer)
47       * optionally followed by a space and then a year or other number,
48       * e.g. "1987" "15th"
49       * LC call numbers can't begin with I, O, W, X, or Y
50       */
51      public static final String LC_CLASS_REQ_REGEX = "[A-Z&&[^IOWXY]]{1}[A-Z]{0,2} *\\d+(\\.\\d+)?";
52  
53      /**
54       * non-cutter text that can appear before or after cutters
55       */
56      public static final String NOT_CUTTER = "([\\da-z]\\w*)|([A-Z]\\D+[\\w]*)";
57  
58      /**
59       * the full LC classification string (can have an optional suffix after LC class)
60       */
61      public static final String LC_CLASS_W_SUFFIX = "(" + LC_CLASS_REQ_REGEX + "( +" + NOT_CUTTER + ")?)";
62  
63      /**
64       * regular expression string for the cutter, without preceding characters
65       * (such as the "required" period, which is sometimes missing, or spaces).
66       * A Cutter is a single letter followed by digits.
67       */
68      public static final String CUTTER_REGEX = "[A-Z]\\d+";
69  
70      /**
71       * the full LC classification string, followed by the first cutter
72       */
73      public static final String LC_CLASS_N_CUTTER = LC_CLASS_W_SUFFIX + " *\\.?" + CUTTER_REGEX;
74      public static final Pattern LC_CLASS_N_CUTTER_PATTERN = Pattern.compile(LC_CLASS_N_CUTTER + ".*");
75  
76      /**
77       * regular expression for Dewey classification.
78       * Dewey classification is a three digit number (possibly missing leading
79       * zeros) with an optional fraction portion.
80       */
81      public static final String DEWEY_CLASS_REGEX = "\\d{1,3}(\\.\\d+)?";
82  
83      /**
84       * Dewey cutters start with a letter, followed by a one to three digit
85       * number. The number may be followed immediately (i.e. without space) by
86       * letters, or followed first by a space and then letters.
87       */
88      public static final String DEWEY_MIN_CUTTER_REGEX = "[A-Z]\\d{1,3}";
89      public static final String DEWEY_CUTTER_TRAILING_LETTERS_REGEX = DEWEY_MIN_CUTTER_REGEX + "[A-Z]+";
90      public static final String DEWEY_CUTTER_SPACE_TRAILING_LETTERS_REGEX = DEWEY_MIN_CUTTER_REGEX + " +[A-Z]+";
91      public static final String DEWEY_FULL_CUTTER_REGEX = DEWEY_MIN_CUTTER_REGEX + " *[A-Z]*+";
92  
93      /**
94       * the full Dewey classification string, followed by the first cutter
95       */
96      public static final String DEWEY_CLASS_N_CUTTER_REGEX = DEWEY_CLASS_REGEX + " *\\.?" + DEWEY_FULL_CUTTER_REGEX;
97      public static final Pattern DEWEY_CLASS_N_CUTTER_PATTERN = Pattern.compile(DEWEY_CLASS_N_CUTTER_REGEX + ".*");
98  
99      /**
100      * regular expression string for complete SuDoc classification
101      * Splits the based on continuous numbers and alphabets
102      * Ignore any special char and spaces.
103      */
104     public static final String SUDOC_REGEX = "[^A-Z0-9]+|(?<=[A-Z])(?=[0-9])|(?<=[0-9])(?=[A-Z])";
105 
106 
107     private static Map<Character, Character> alphanumReverseMap = new HashMap<Character, Character>();
108 
109     static {
110         alphanumReverseMap.put('0', 'Z');
111         alphanumReverseMap.put('1', 'Y');
112         alphanumReverseMap.put('2', 'X');
113         alphanumReverseMap.put('3', 'W');
114         alphanumReverseMap.put('4', 'V');
115         alphanumReverseMap.put('5', 'U');
116         alphanumReverseMap.put('6', 'T');
117         alphanumReverseMap.put('7', 'S');
118         alphanumReverseMap.put('8', 'R');
119         alphanumReverseMap.put('9', 'Q');
120         alphanumReverseMap.put('A', 'P');
121         alphanumReverseMap.put('B', 'O');
122         alphanumReverseMap.put('C', 'N');
123         alphanumReverseMap.put('D', 'M');
124         alphanumReverseMap.put('E', 'L');
125         alphanumReverseMap.put('F', 'K');
126         alphanumReverseMap.put('G', 'J');
127         alphanumReverseMap.put('H', 'I');
128         alphanumReverseMap.put('I', 'H');
129         alphanumReverseMap.put('J', 'G');
130         alphanumReverseMap.put('K', 'F');
131         alphanumReverseMap.put('L', 'E');
132         alphanumReverseMap.put('M', 'D');
133         alphanumReverseMap.put('N', 'C');
134         alphanumReverseMap.put('O', 'B');
135         alphanumReverseMap.put('P', 'A');
136         alphanumReverseMap.put('Q', '9');
137         alphanumReverseMap.put('R', '8');
138         alphanumReverseMap.put('S', '7');
139         alphanumReverseMap.put('T', '6');
140         alphanumReverseMap.put('U', '5');
141         alphanumReverseMap.put('V', '4');
142         alphanumReverseMap.put('W', '3');
143         alphanumReverseMap.put('X', '2');
144         alphanumReverseMap.put('Y', '1');
145         alphanumReverseMap.put('Z', '0');
146     }
147 
148 
149     /**
150      * this character will sort first
151      */
152     public static char SORT_FIRST_CHAR = Character.MIN_VALUE;
153     public static StringBuilder reverseDefault = new StringBuilder(75);
154 
155     static {
156         for (int i = 0; i < 50; i++)
157 // N.B.:  this char is tough to deal with in a variety of contexts.
158 // Hopefully diacritics and non-latin won't bite us in the butt.
159 //          reverseDefault.append(Character.toChars(Character.MAX_CODE_POINT));
160             reverseDefault.append(Character.toChars('~'));
161     }
162 
163 //------ public methods --------
164 
165     /**
166      * given a possible Library of Congress call number value, determine if it
167      * matches the pattern of an LC call number
168      */
169     public static final boolean isValidLC(String possLCval) {
170         if (possLCval != null && LC_CLASS_N_CUTTER_PATTERN.matcher(possLCval.trim()).matches())
171             return true;
172         return false;
173     }
174 
175     /**
176      * given a possible Dewey call number value, determine if it
177      * matches the pattern of an Dewey call number
178      */
179     public static final boolean isValidDeweyWithCutter(String possDeweyVal) {
180         if (possDeweyVal != null && DEWEY_CLASS_N_CUTTER_PATTERN.matcher(possDeweyVal.trim()).matches())
181             return true;
182         return false;
183     }
184 
185     /**
     * given a possible Dewey call number value, determine if it begins with a
     * Dewey classification (one to three digits and an optional decimal
     * fraction); unlike isValidDeweyWithCutter, no cutter is required
188      */
189     public static final boolean isValidDewey(String possDeweyVal) {
190         if (possDeweyVal != null && DEWEY_PATTERN.matcher(possDeweyVal.trim()).matches())
191             return true;
192         return false;
193     }
194 
195     /**
     * return the portion of the call number string that occurs before the
     * Cutter, NOT including any class suffixes occurring before the cutter
198      */
199     public static final String getPortionBeforeCutter(String callnum) {
200 
201         // cutter is a single letter followed by digits.
202         // there may be a space before a cutter
203         // there should be a period, which is followed by a single letter
204         //   the period is sometimes missing
205         // For Dewey callnumber, there may be a slash instead of a cutter,
206         //  or there might be NO cutter
207         String beginCutterRegex = "( +|(\\.[A-Z])| */)";
208 
209         String[] pieces = callnum.split(beginCutterRegex);
210         if (pieces.length == 0 || pieces[0] == null || pieces[0].length() == 0)
211             return null;
212         else
213             return pieces[0].trim();
214     }
215 
216     /**
217      * return the portion of the LC call number string that occurs before the
218      * Cutter.
219      */
220     public static final String getLCB4FirstCutter(String callnum) {
221         String result = null;
222 
223         String cutter = getFirstLCcutter(callnum);
224         if (cutter != null && cutter.length() > 0) {
225             // lc class can start with same chars as first cutter: (G384 G3)
226             int ix = callnum.indexOf(cutter);
227             String lets = getLCstartLetters(callnum);
228             if (ix < lets.length())
229                 ix = callnum.indexOf(cutter, lets.length());
230 
231             if (ix > 0) {
232                 result = callnum.substring(0, ix).trim();
233                 if (result.endsWith("."))
234                     result = result.substring(0, result.length() - 1).trim();
235             } else
236                 result = callnum;
237         } else // no cutter
238             result = callnum;
239 
240         return result;
241     }
242 
243     /**
244      * Given a raw LC call number, return the initial letters (before any
245      * numbers)
246      */
247     public static String getLCstartLetters(String rawLCcallnum) {
248         String result = null;
249         if (rawLCcallnum != null && rawLCcallnum.length() > 0) {
250             String[] lcClass = rawLCcallnum.split("[^A-Z]+");
251             if (lcClass.length > 0)
252                 result = lcClass[0];
253         }
254         return result;
255     }
256 
257     /**
258      * return the numeric portion of the required portion of the LC classification.
259      * LC classification requires
260      * 1-3 capital letters followed by  float number (may be an integer)
261      *
262      * @param rawLCcallnum
263      */
264     public static String getLCClassDigits(String rawLCcallnum) {
265         String result = null;
266 
267         String rawClass = getLCB4FirstCutter(rawLCcallnum);
268         if (rawClass != null && rawClass.length() > 0) {
269             String[] pieces = rawClass.split("[A-Z ]+");
270             if (pieces.length > 1)
271                 result = pieces[1].trim();
272         }
273         return result;
274     }
275 
276     /**
277      * return the string between the LC class number and the cutter, if it
278      * starts with a digit, null otherwise
279      *
280      * @param rawLCcallnum - the entire LC call number, as a string
281      */
282     public static String getLCClassSuffix(String rawLCcallnum) {
283         String result = null;
284 
285         String b4cutter = getLCB4FirstCutter(rawLCcallnum);
286         if (b4cutter == null || b4cutter.length() == 0)
287             return null;
288 
289         String classDigits = getLCClassDigits(rawLCcallnum);
290 
291         if (classDigits != null && classDigits.length() > 0) {
292             int reqClassLen = b4cutter.indexOf(classDigits) + classDigits.length();
293 
294             if (b4cutter.length() > reqClassLen)
295                 result = b4cutter.substring(reqClassLen).trim();
296         }
297 
298         return result;
299     }
300 
301     /**
302      * return the first cutter in the LC call number, without the preceding
303      * characters (such as the "required" period, which is sometimes missing,
304      * or spaces), or any suffixes
305      *
306      * @param rawCallnum - the entire call number, as a string
307      */
308     public static String getFirstLCcutter(String rawCallnum) {
309         String result = null;
310 
311         String regex = LC_CLASS_W_SUFFIX + " *\\.?(" + CUTTER_REGEX + ")";
312         Pattern pattern = Pattern.compile(regex);
313         Matcher matcher = pattern.matcher(rawCallnum);
314 
315         if (matcher.find())
316             result = matcher.group(6).trim();
317 
318         // if no well formed cutter, take the chunk after last period or space
319         //  if it begins with a letter
320 //        if (result == null) {
321 //            int i = rawCallnum.trim().lastIndexOf('.');  // period
322 //            if (i == -1)
323 //                i = rawCallnum.trim().lastIndexOf(' ');  // space
324 //            if (rawCallnum.trim().length() > i + 1) {
325 //                String possible = rawCallnum.trim().substring(i + 1).trim();
326 //                if (Character.isLetter(possible.charAt(0)))
327 //                    result = possible;
328 //            }
329 //        }
330 
331         return result;
332     }
333 
334     /**
335      * return the suffix after the first cutter, if there is one.  This occurs
336      * before the second cutter, if there is one.
337      *
338      * @param rawLCcallnum - the entire LC call number, as a string
339      */
340     public static String getFirstLCcutterSuffix(String rawLCcallnum) {
341         String result = null;
342 
343         String regex = LC_CLASS_N_CUTTER + " *(" + NOT_CUTTER + ")*";
344         Pattern pattern = Pattern.compile(regex);
345         Matcher matcher = pattern.matcher(rawLCcallnum);
346 
347         // non cutter string optionally followed by cutter preceded by a period
348         if (matcher.find() && matcher.groupCount() > 5
349                 && matcher.group(6) != null && matcher.group(6).length() > 0) {
350 
351             // this only grabs the FIRST non-cutter string it encounters after
352             //   the first cutter
353             result = matcher.group(6).trim();
354 
355             // this is to cope with additional non-cutter strings after the
356             //  first cutter  (e.g. M211 .M93 K.240 1988)
357             int endLastIx = matcher.end(6); // end of previous match
358             if (endLastIx < rawLCcallnum.length()) {
359                 // if there is a suffix, there must be a period before second cutter
360                 Pattern cutterPat = Pattern.compile(" *\\." + CUTTER_REGEX);
361                 matcher.usePattern(cutterPat);
362                 if (matcher.find(endLastIx)) {
363                     if (endLastIx < matcher.start())
364                         result = result.trim() + " " + rawLCcallnum.substring(endLastIx, matcher.start()).trim();
365                 } else
366                     result = result + rawLCcallnum.substring(endLastIx);
367             }
368         } else {
369             // string after first cutter looks like a second cutter, but is
370             //  not because further on there is a second cutter preceded by
371             //  a period.
372             // look for period before second cutter
373             String afterLCclassNCutter = rawLCcallnum.replaceFirst(LC_CLASS_N_CUTTER + " *", "");
374             String cutterRegex = LC_CLASS_N_CUTTER + " *(.*)\\." + CUTTER_REGEX;
375 
376             pattern = Pattern.compile(cutterRegex);
377             matcher = pattern.matcher(rawLCcallnum);
378 
379             if (matcher.find() && matcher.groupCount() > 5
380                     && matcher.group(6) != null && matcher.group(6).length() > 0)
381                 // there is a second cutter preceded by a period
382                 result = matcher.group(6).trim();
383             else {
384                 regex = LC_CLASS_N_CUTTER + " \\.\\.\\.$";
385                 pattern = Pattern.compile(regex);
386                 matcher = pattern.matcher(rawLCcallnum);
387                 if (matcher.find())
388                     result = " ...";
389             }
390         }
391         return result;
392     }
393 
394     /**
395      * return the second cutter in the call number, without the preceding
396      * characters (such as the "required" period, which is sometimes missing,
397      * or spaces), or any suffixes
398      *
399      * @param rawLCcallnum - the entire call number, as a string
400      */
401     public static String getSecondLCcutter(String rawLCcallnum) {
402         String result = null;
403 
404         String firstCutSuffix = getFirstLCcutterSuffix(rawLCcallnum);
405         if (firstCutSuffix == null || firstCutSuffix.length() == 0) {
406             // look for second cutter
407             String regex = LC_CLASS_N_CUTTER + " *\\.?(" + CUTTER_REGEX + ")";
408             Pattern pattern = Pattern.compile(regex);
409             Matcher matcher = pattern.matcher(rawLCcallnum);
410             if (matcher.find() && matcher.groupCount() > 5
411                     && matcher.group(6) != null && matcher.group(6).length() > 0) {
412                 result = matcher.group(6).trim();
413             }
414         } else {
415             // get the text AFTER the first cutter suffix, then parse out
416             //   cutter text from any potential following text.
417             int ix = rawLCcallnum.indexOf(firstCutSuffix) + firstCutSuffix.length();
418             if (ix < rawLCcallnum.length()) {
419                 String remaining = rawLCcallnum.substring(ix).trim();
420                 Pattern pattern = Pattern.compile("(" + CUTTER_REGEX + ")");
421                 Matcher matcher = pattern.matcher(remaining);
422                 if (matcher.find() && matcher.group(1) != null && matcher.group(1).length() > 0) {
423                     result = matcher.group(1).trim();
424                 }
425             }
426             // if we still have nothing, look for 2nd cutter in first cutter suffix
427             if (result == null) {
428                 Pattern pattern = Pattern.compile("\\.(" + CUTTER_REGEX + ")");
429                 Matcher matcher = pattern.matcher(firstCutSuffix);
430                 if (matcher.find() && matcher.group(1) != null && matcher.group(1).length() > 0) {
431                     result = matcher.group(1).trim();
432                 }
433             }
434         }
435         return result;
436     }
437 
438     /**
     * return the suffix after the second cutter, if there is one: the rest of
     * the call number following the second cutter.
441      *
442      * @param rawLCcallnum - the entire LC call number, as a string
443      */
444     public static String getSecondLCcutterSuffix(String rawLCcallnum) {
445         String result = null;
446 
447         String secondCutter = getSecondLCcutter(rawLCcallnum);
448         if (secondCutter != null && secondCutter.length() > 0) {
449             // get the call number after the 2nd cutter
450             int ix = rawLCcallnum.indexOf(secondCutter) + secondCutter.length();
451             if (ix < rawLCcallnum.length())
452                 result = rawLCcallnum.substring(ix).trim();
453         }
454 
455         return result;
456     }
457 
458     /**
459      * return the suffix after the first cutter, if there is one.  This occurs
460      * before the second cutter, if there is one.
461      *
462      * @param rawLCcallnum - the entire LC call number, as a string
463      * @deprecated
464      */
465 // do we want to separate out year suffixes?  for all or just here? - unused
466     public static String getSecondLCcutterYearSuffix(String rawLCcallnum) {
467         String result = null;
468 
469         String regex = LC_CLASS_N_CUTTER + " *(" + NOT_CUTTER + ")*";
470         Pattern pattern = Pattern.compile(regex);
471         Matcher matcher = pattern.matcher(rawLCcallnum);
472 
473         if (matcher.find() && matcher.groupCount() > 5
474                 && matcher.group(6) != null && matcher.group(6).length() > 0) {
475 
476             // this only grabs the FIRST non-cutter string it encounters after
477             //   the first cutter
478             result = matcher.group(6);
479 
480             // this is to cope with additional non-cutter strings after the
481             //  first cutter  (e.g. M211 .M93 K.240 1988)
482             int endLastIx = matcher.end(6); // end of previous match
483             if (endLastIx < rawLCcallnum.length()) {
484                 Pattern cutterPat = Pattern.compile(" *\\.?" + CUTTER_REGEX + ".*");
485                 matcher.usePattern(cutterPat);
486                 if (matcher.find(endLastIx)) {
487                     if (endLastIx < matcher.start())
488                         result = result.trim() + " " + rawLCcallnum.substring(endLastIx, matcher.start()).trim();
489                 } else
490                     result = result.trim() + rawLCcallnum.substring(endLastIx);
491             }
492         }
493 
494         return result;
495     }
496 
497     /**
498      * return the portion of the Dewey call number string that occurs before the
499      * Cutter.
500      */
501     public static final String getDeweyB4Cutter(String callnum) {
502         String result = null;
503 
504         String entireCallNumRegex = "(" + DEWEY_CLASS_REGEX + ").*";
505         Pattern pattern = Pattern.compile(entireCallNumRegex);
506         Matcher matcher = pattern.matcher(callnum);
507         if (matcher.find())
508             result = matcher.group(1).trim();
509 
510         return result;
511     }
512 
513     /**
514      * return the first cutter in the call number, without the preceding
515      * characters (such as the "required" period, which is sometimes missing,
516      * or spaces).
517      *
518      * @param rawCallnum - the entire call number, as a string
519      */
520     public static String getDeweyCutter(String rawCallnum) {
521         String result = null;
522 
523         // dewey cutters can have trailing letters, preceded by a space or not
524         String regex1 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_TRAILING_LETTERS_REGEX + ")( +" + NOT_CUTTER + ".*)";
525         String regex2 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_MIN_CUTTER_REGEX + ")( +" + NOT_CUTTER + ".*)";
526         String regex3 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_SPACE_TRAILING_LETTERS_REGEX + ")( +" + NOT_CUTTER + ".*)";
527         String regex4 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_TRAILING_LETTERS_REGEX + ")(.*)";
528         String regex5 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_MIN_CUTTER_REGEX + ")(.*)";
529         String regex6 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_SPACE_TRAILING_LETTERS_REGEX + ")(.*)";
530         Pattern pat1 = Pattern.compile(regex1);
531         Pattern pat2 = Pattern.compile(regex2);
532         Pattern pat3 = Pattern.compile(regex3);
533         Pattern pat4 = Pattern.compile(regex4);
534         Pattern pat5 = Pattern.compile(regex5);
535         Pattern pat6 = Pattern.compile(regex6);
536 
537         Matcher matcher = pat1.matcher(rawCallnum);
538         if (!matcher.find()) {
539             matcher = pat2.matcher(rawCallnum);
540             if (!matcher.find()) {
541                 matcher = pat3.matcher(rawCallnum);
542             }
543         }
544 
545         if (matcher.find()) {
546             String cutter = matcher.group(2);
547             String suffix = matcher.group(3);
548             if (suffix.length() == 0)
549                 result = cutter.trim();
550             else {
551                 // check if there are letters in the cutter that should be assigned
552                 //  to the suffix
553                 if (suffix.startsWith(" ") || cutter.endsWith(" "))
554                     result = cutter.trim();
555                 else {
556                     int ix = cutter.lastIndexOf(' ');
557                     if (ix != -1)
558                         result = cutter.substring(0, ix);
559                     else
560                         result = cutter.trim();
561                 }
562             }
563         } else {
564             matcher = pat4.matcher(rawCallnum);
565             if (matcher.find())
566                 result = matcher.group(2);
567             else {
568                 matcher = pat5.matcher(rawCallnum);
569                 if (matcher.find())
570                     result = matcher.group(2);
571                 else {
572                     matcher = pat6.matcher(rawCallnum);
573                     if (matcher.find())
574                         result = matcher.group(2);
575                 }
576             }
577         }
578         if (result != null)
579             return result.trim();
580         return result;
581     }
582 
583     /**
584      * return suffix to the first cutter in the dewey call number
585      *
586      * @param rawCallnum - the entire call number, as a string
587      */
588     public static String getDeweyCutterSuffix(String rawCallnum) {
589         if (rawCallnum == null || rawCallnum.length() == 0)
590             return null;
591         String result = null;
592 
593         String cutter = getDeweyCutter(rawCallnum);
594         if (cutter != null) {
595             int ix = rawCallnum.indexOf(cutter) + cutter.length();
596             result = rawCallnum.substring(ix).trim();
597         }
598 
599         if (result == null || result.length() == 0) {
600             // dewey cutters can have trailing letters, preceded by a space or not
601             String regex1 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_TRAILING_LETTERS_REGEX + ")( +" + NOT_CUTTER + ".*)";
602             String regex2 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_MIN_CUTTER_REGEX + ")( +" + NOT_CUTTER + ".*)";
603             String regex3 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_SPACE_TRAILING_LETTERS_REGEX + ")( +" + NOT_CUTTER + ".*)";
604             String regex4 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_TRAILING_LETTERS_REGEX + ")(.*)";
605             String regex5 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_MIN_CUTTER_REGEX + ")(.*)";
606             String regex6 = DEWEY_CLASS_REGEX + " *\\.?(" + DEWEY_CUTTER_SPACE_TRAILING_LETTERS_REGEX + ")(.*)";
607             Pattern pat1 = Pattern.compile(regex1);
608             Pattern pat2 = Pattern.compile(regex2);
609             Pattern pat3 = Pattern.compile(regex3);
610             Pattern pat4 = Pattern.compile(regex4);
611             Pattern pat5 = Pattern.compile(regex5);
612             Pattern pat6 = Pattern.compile(regex6);
613 
614             Matcher matcher = pat1.matcher(rawCallnum);
615             if (!matcher.find()) {
616                 matcher = pat2.matcher(rawCallnum);
617                 if (!matcher.find()) {
618                     matcher = pat3.matcher(rawCallnum);
619                     if (!matcher.find()) {
620                         matcher = pat4.matcher(rawCallnum);
621                         if (!matcher.find()) {
622                             matcher = pat5.matcher(rawCallnum);
623                             if (!matcher.find()) {
624                                 matcher = pat6.matcher(rawCallnum);
625                             }
626                         }
627                     }
628                 }
629             }
630 
631             if (matcher.find(0)) {
632                 cutter = matcher.group(2);
633                 String suffix = matcher.group(3);
634                 if (suffix.trim().length() > 0) {
635                     // check if there are letters in the cutter that should be assigned
636                     //  to the suffix
637                     if (suffix.startsWith(" ") || cutter.endsWith(" "))
638                         result = suffix;
639                     else {
640                         int ix = cutter.lastIndexOf(' ');
641                         if (ix != -1)
642                             result = cutter.substring(ix) + suffix;
643                         else
644                             result = suffix;
645                     }
646                 }
647             }
648         }
649         if (result != null)
650             result = result.trim();
651         if (result == null || result.trim().length() == 0)
652             return null;
653         else
654             return result;
655     }
656 
657 
658     /**
659      * Used to improve call num sorting and volume lopping.
660      * Remove leading and trailing whitespace, ensure whitespace is always a
661      * single space, remove spaces after periods, remove trailing periods
662      *
663      * @param rawCallnum - a non-null String containing a Dewey call number
664      * @return normalized form of a call number
665      */
666     public static String normalizeCallnum(String rawCallnum) {
667 
668         // reduce multiple whitespace chars to a single space
669         String normalizedCallnum = rawCallnum.trim().replaceAll("\\s\\s+", " ");
670         // reduce double periods to a single period
671         normalizedCallnum = normalizedCallnum.replaceAll("\\. \\.", " .");
672         // remove space after a period if period is after digits and before letters
673         normalizedCallnum = normalizedCallnum.replaceAll("(\\d+\\.) ([A-Z])", "$1$2");
674         // remove trailing period and any spaces before it
675         if (normalizedCallnum.endsWith("."))
676             normalizedCallnum = normalizedCallnum.substring(0, normalizedCallnum.length() - 1).trim();
677 
678         // cutter could be missing preceding period, but we are leaving that as is
679 
680         // there should be a single space before the cutter - the above should
681         //  ensure this in nearly all cases
682         return normalizedCallnum;
683     }
684 
685     /**
686      * reduce multiple whitespace to single, remove spaces before or after
687      * periods, remove spaces between letters and class digits
688      */
689     static String normalizeLCcallnum(String rawLCcallnum) {
690         String normCallnum = normalizeCallnum(rawLCcallnum);
691         // remove space between class letters and digits
692         return normCallnum.replaceAll("^([A-Z][A-Z]?[A-Z]?) ([0-9])", "$1$2");
693     }
694 
695 
696 // TODO:  method to normalize year and immediate following chars (no space)?   <-- stupid?
697 
698     /**
699      * given a raw LC call number, return the shelf key - a sortable version
700      * of the call number
701      */
702     public static String getLCShelfkey(String rawLCcallnum, String recid) {
703         return (getLCShelfkey(rawLCcallnum, recid, null));
704     }
705 
706     /**
707      * given a raw LC call number, return the shelf key - a sortable version
708      * of the call number
709      */
710     public static String getLCShelfkey(String rawLCcallnum, String recid, ErrorHandler errors) {
711         StringBuilder resultBuf = new StringBuilder();
712         String upcaseLCcallnum = rawLCcallnum.toUpperCase();
713 
714 // TODO: don't repeat same parsing -- some of these methods could take the
715 //   portion of the callnumber before the cutter as the input string.
716 
717         // pad initial letters with trailing blanks to be 4 chars long
718         StringBuilder initLetBuf = new StringBuilder("    ");
719         String lets = getLCstartLetters(upcaseLCcallnum);
720         if (lets != null) {
721             initLetBuf.replace(0, lets.length(), lets);
722         } else {
723             if ((recid != null) && (!rawLCcallnum.startsWith("XX"))) // Stanford mod
724             {
725                 if (errors == null) {
726                     System.err.println("Problem creating shelfkey for record " + recid + "; call number: " + rawLCcallnum);
727                 } else {
728                     errors.addError(ErrorHandler.ERROR_TYPO, "Problem creating shelfkey for record " + recid + "; call number: " + rawLCcallnum);
729                 }
730             }
731             return (rawLCcallnum + " ");
732         }
733         resultBuf.append(initLetBuf);
734 
735         try {
736             // normalize first numeric portion to a constant length:
737             //  four digits before decimal, 6 digits after
738             String digitStr = getLCClassDigits(upcaseLCcallnum);
739             if (digitStr != null)
740                 resultBuf.append(normalizeFloat(digitStr, 4, 6));
741             else
742                 resultBuf.append(normalizeFloat("0", 4, 6));
743 
744             // optional string b/t class and first cutter
745             String classSuffix = getLCClassSuffix(upcaseLCcallnum);
746             if (classSuffix != null)
747                 resultBuf.append(" " + normalizeSuffix(classSuffix));
748 
749             // normalize first cutter  - treat number as a fraction
750             String firstCutter = getFirstLCcutter(upcaseLCcallnum);
751             if (firstCutter != null) {
752                 resultBuf.append(" " + normalizeCutter(firstCutter, 6));
753 
754                 // normalize optional first cutter suffix
755                 String firstCutterSuffix = getFirstLCcutterSuffix(upcaseLCcallnum);
756                 if (firstCutterSuffix != null)
757                     resultBuf.append(" " + normalizeSuffix(firstCutterSuffix));
758 
759                 // optional second cutter - normalize
760                 String secondCutter = getSecondLCcutter(upcaseLCcallnum);
761                 if (secondCutter != null) {
762                     resultBuf.append(" " + normalizeCutter(secondCutter, 6));
763 
764                     String secondCutterSuffix = getSecondLCcutterSuffix(upcaseLCcallnum);
765                     if (secondCutterSuffix != null)
766                         resultBuf.append(" " + normalizeSuffix(secondCutterSuffix));
767                 }
768             }
769         } catch (NumberFormatException e) {
770 //              if (recid != null)
771             if ((recid != null) && (!rawLCcallnum.startsWith("XX"))) // Stanford mod
772             {
773                 if (errors == null) {
774                     System.err.println("Problem creating shelfkey for record " + recid + "; call number: " + rawLCcallnum);
775                 } else {
776                     errors.addError(ErrorHandler.ERROR_TYPO, "Problem creating shelfkey for record " + recid + "; call number: " + rawLCcallnum);
777                 }
778             }
779             //e.printStackTrace();
780             resultBuf = new StringBuilder();
781         }
782 
783         if (resultBuf.length() == 0)
784             resultBuf.append(upcaseLCcallnum);
785 
786         return resultBuf.toString().trim();
787     }
788 
789     /**
790      * normalize the cutter string for shelf list sorting - make number into
791      * decimal of the number of digits indicated by param
792      */
793     private static String normalizeCutter(String cutter, int numDigits) {
794         String result = null;
795         if (cutter != null && cutter.length() > 0) {
796             String cutLets = getLCstartLetters(cutter);
797             String cutDigs = cutter.substring(cutLets.length());
798             String norm = null;
799             if (cutDigs != null && cutDigs.length() > 0) {
800                 try {
801                     // make sure part after letters is an integer
802                     Integer.parseInt(cutDigs);
803                     norm = normalizeFloat("." + cutDigs, 1, numDigits);
804                 } catch (NumberFormatException e) {
805                     norm = cutDigs;
806                 }
807             } else if (cutDigs.length() == 0 && cutLets.length() == 1)
808                 // if no digits in cutter, want it to sort first
809                 norm = normalizeFloat("0", 1, numDigits);
810 
811             result = cutLets + norm;
812         }
813         return result;
814     }
815 
816     /**
817      * normalize a suffix for shelf list sorting by changing all digit
818      * substrings to a constant length (left padding with zeros).
819      */
820     public static String normalizeSuffix(String suffix) {
821         if (suffix != null && suffix.length() > 0) {
822             StringBuilder resultBuf = new StringBuilder(suffix.length());
823             // get digit substrings
824             String[] digitStrs = suffix.split("[\\D]+");
825             int len = digitStrs.length;
826             if (digitStrs != null && len != 0) {
827                 int s = 0;
828                 for (int d = 0; d < len; d++) {
829                     String digitStr = digitStrs[d];
830                     int ix = suffix.indexOf(digitStr, s);
831                     // add the non-digit chars before, if they exist
832                     if (s < ix) {
833                         String text = suffix.substring(s, ix);
834                         resultBuf.append(text);
835                     }
836                     if (digitStr != null && digitStr.length() != 0) {
837                         // add the normalized digit chars, if they exist
838                         resultBuf.append(normalizeFloat(digitStr, 6, 0));
839                         s = ix + digitStr.length();
840                     }
841 
842                 }
843                 // add any chars after the last digStr
844                 resultBuf.append(suffix.substring(s));
845                 return resultBuf.toString();
846             }
847         }
848 
849         return suffix;
850     }
851 
852     /**
853      * given a shelfkey (a lexicaly sortable call number), return the reverse
854      * shelf key - a sortable version of the call number that will give the
855      * reverse order (for getting "previous" call numbers in a list)
856      */
857     public static String getReverseShelfKey(String shelfkey) {
858         StringBuilder resultBuf = new StringBuilder(reverseDefault);
859         if (shelfkey != null && shelfkey.length() > 0)
860             resultBuf.replace(0, shelfkey.length(), reverseAlphanum(shelfkey));
861         return resultBuf.toString();
862     }
863 
864     /**
865      * return the reverse String value, mapping A --> 9, B --> 8, ...
866      * 9 --> A and also non-alphanum to sort properly (before or after alphanum)
867      */
868     private static String reverseAlphanum(String orig) {
869 
870 /*
871         char[] origArray = orig.toCharArray();
872 
873         char[] reverse = new char[origArray.length];
874         for (int i = 0; i < origArray.length; i++) {
875             Character ch = origArray[i];
876             if (ch != null) {
877                 if (Character.isLetterOrDigit(ch))
878                     reverse[i] = alphanumReverseMap.get(ch);
879                 else
880                     reverse[i] = reverseNonAlphanum(ch);
881             }
882         }
883 */
884         StringBuilder reverse = new StringBuilder();
885         for (int ix = 0; ix < orig.length(); ) {
886             int codePoint = Character.toUpperCase(orig.codePointAt(ix));
887             char[] chs = Character.toChars(codePoint);
888 
889             if (Character.isLetterOrDigit(codePoint)) {
890                 if (chs.length == 1) {
891                     char c = chs[0];
892                     if (alphanumReverseMap.containsKey(c))
893                         reverse.append(alphanumReverseMap.get(c));
894                     else {
895                         // not an ASCII letter or digit
896 
897                         // map latin chars with diacritic to char without
898                         char foldC;
899 
900                         if (UCharacter.UnicodeBlock.of(c) != UCharacter.UnicodeBlock.COMBINING_DIACRITICAL_MARKS &&
901                                 UCharacter.UnicodeBlock.of(c) != UCharacter.UnicodeBlock.SPACING_MODIFIER_LETTERS &&
902                                 (foldC = Utils.foldDiacriticLatinChar(c)) != 0x00)
903                             // we mapped a latin char w diacritic to plain ascii
904                             reverse.append(alphanumReverseMap.get(foldC));
905                         else
906                             // single char, but non-latin, non-digit
907                             // ... view it as after Z in regular alphabet, for now
908                             reverse.append(SORT_FIRST_CHAR);
909                     }
910                 } else {
911                     // multiple 16 bit character unicode letter
912                     // ... view it as after Z in regular alphabet, for now
913                     reverse.append(SORT_FIRST_CHAR);
914                 }
915             } else // not a letter or a digit
916                 reverse.append(reverseNonAlphanum(chs[0]));
917 
918             ix += chs.length;
919         }
920 
921         return new String(reverse);
922     }
923 
924     /**
925      * for non alpha numeric characters, return a character that will sort
926      * first or last, whichever is the opposite of the original character.
927      */
928     public static char[] reverseNonAlphanum(char ch) {
929         // use punctuation before or after alphanum as appropriate
930         switch (ch) {
931             case '.':
932                 return Character.toChars('}');
933             case '{':
934             case '|':
935             case '}':
936             case '~':
937 // N.B.:  these are tough to deal with in a variety of contexts.
938 // Hopefully diacritics and non-latin won't bite us in the butt.
939 //              return Character.toChars(Character.MIN_CODE_POINT);
940                 return Character.toChars(' ');
941             default:
942 //              return Character.toChars(Character.MAX_CODE_POINT);
943                 return Character.toChars('~');
944         }
945     }
946 
947     /**
948      * given a raw Dewey call number, return the shelf key - a sortable
949      * version of the call number
950      */
951     public static String getDeweyShelfKey(String rawDeweyCallnum) {
952         StringBuilder resultBuf = new StringBuilder();
953 
954         // class
955         // float number, normalized to have 3 leading zeros
956         //   and trailing zeros if blank doesn't sort before digits
957         String classNum = normalizeFloat(getDeweyB4Cutter(rawDeweyCallnum), 3, 8);
958         resultBuf.append(classNum);
959 
960         // cutter   1-3 digits
961         // optional cutter letters suffix
962         //   letters preceded by space or not.
963 
964         // normalize cutter  - treat number as a fraction.
965         String cutter = getDeweyCutter(rawDeweyCallnum);
966         if (cutter != null)
967             resultBuf.append(" " + cutter);
968 
969         // optional suffix (year, part, volume, edition) ...
970         String cutterSuffix = getDeweyCutterSuffix(rawDeweyCallnum);
971         if (cutterSuffix != null)
972             resultBuf.append(" " + normalizeSuffix(cutterSuffix));
973 
974 
975         if (resultBuf.length() == 0)
976             resultBuf.append(rawDeweyCallnum);
977 
978         return resultBuf.toString().trim();
979     }
980 
981 
982     /**
983      * normalizes numbers (can have decimal portion) to (digitsB4) before
984      * the decimal (adding leading zeroes as necessary) and (digitsAfter
985      * after the decimal.  In the case of a whole number, there will be no
986      * decimal point.
987      *
988      * @param floatStr,   the number, as a String
989      * @param digitsB4    - the number of characters the result should have before the
990      *                    decimal point (leading zeroes will be added as necessary). A negative
991      *                    number means leave whatever digits encountered as is; don't pad with leading zeroes.
992      * @param digitsAfter - the number of characters the result should have after
993      *                    the decimal point.  A negative number means leave whatever fraction
994      *                    encountered as is; don't pad with trailing zeroes (trailing zeroes in
995      *                    this case will be removed)
996      * @throws NumberFormatException if string can't be parsed as a number
997      */
998     public static String normalizeFloat(String floatStr, int digitsB4, int digitsAfter) {
999         double value = Double.valueOf(floatStr).doubleValue();
1000 
1001         String formatStr = getFormatString(digitsB4) + '.' + getFormatString(digitsAfter);
1002 
1003         DecimalFormat normFormat = new DecimalFormat(formatStr);
1004         String norm = normFormat.format(value);
1005         if (norm.endsWith("."))
1006             norm = norm.substring(0, norm.length() - 1);
1007         return norm;
1008     }
1009 
1010     private static String PUNCT_PREFIX = "([\\.:\\/])?";
1011     private static String NS_PREFIX = "(n\\.s\\.?\\,? ?)?";
1012     private static String MONTHS = "jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec";
1013     private static String VOL_LETTERS = "[\\:\\/]?(bd|iss|jahrg|new ser|no|part|pts?|ser|t|v|vols?|vyp" + "|" + MONTHS + ")";
1014     private static String VOL_NUMBERS = "\\d+([\\/-]\\d+)?( \\d{4}([\\/-]\\d{4})?)?( ?suppl\\.?)?";
1015     private static String VOL_NUMBERS_LOOSER = "\\d+.*";
1016     private static String VOL_NUM_AS_LETTERS = "[A-Z]([\\/-]\\[A-Z]+)?.*";
1017 
1018     private static Pattern VOL_PATTERN = Pattern.compile(PUNCT_PREFIX + NS_PREFIX + VOL_LETTERS + "\\.? ?" + VOL_NUMBERS, Pattern.CASE_INSENSITIVE);
1019     private static Pattern VOL_PATTERN_LOOSER = Pattern.compile(PUNCT_PREFIX + NS_PREFIX + VOL_LETTERS + "\\.? ?" + VOL_NUMBERS_LOOSER, Pattern.CASE_INSENSITIVE);
1020     private static Pattern VOL_PATTERN_LETTERS = Pattern.compile(PUNCT_PREFIX + NS_PREFIX + VOL_LETTERS + "[\\/\\. ]" + VOL_NUM_AS_LETTERS, Pattern.CASE_INSENSITIVE);
1021 
1022     /**
1023      * remove volume information from LC call number if it is present as a
1024      * suffix
1025      *
1026      * @param rawLCcallnum
1027      * @return call number without the volume information, or full call number
1028      *         if no volume information was present.
1029      */
1030     public static String removeLCVolSuffix(String rawLCcallnum) {
1031         // get suffix to last occurring cutter, if there is one
1032         String suffix = getSecondLCcutterSuffix(rawLCcallnum);
1033         if (suffix == null || suffix.length() == 0) {
1034             String cut1suffix = getFirstLCcutterSuffix(rawLCcallnum);
1035             if (cut1suffix != null) {
1036                 // first cutter suffix may contain second cutter
1037                 String cut2 = getSecondLCcutter(rawLCcallnum);
1038                 if (cut2 != null) {
1039                     int ix = cut1suffix.indexOf(cut2);
1040                     if (ix != -1)
1041                         suffix = cut1suffix.substring(0, ix);
1042                     else
1043                         suffix = cut1suffix;
1044                 } else
1045                     suffix = cut1suffix;
1046             }
1047         }
1048 
1049         // could put last ditch effort with tightest pattern, but don't want to take out too much
1050         if (suffix != null && suffix.length() > 0) {
1051             Matcher matcher = VOL_PATTERN.matcher(suffix);
1052             if (!matcher.find()) {
1053                 matcher = VOL_PATTERN_LOOSER.matcher(suffix);
1054                 if (!matcher.find()) {
1055                     matcher = VOL_PATTERN_LETTERS.matcher(suffix);
1056                 }
1057             }
1058 // look for first / last match, not any match (subroutine?)?
1059             if (matcher.find(0)) {
1060                 // return orig call number with matcher part lopped off.
1061                 int ix = rawLCcallnum.indexOf(suffix) + matcher.start();
1062                 if (ix != -1 && ix < rawLCcallnum.length()) {
1063                     return rawLCcallnum.substring(0, ix).trim();
1064                 }
1065             }
1066         }
1067         return rawLCcallnum;
1068     }
1069 
1070 
1071     /**
1072      * remove volume information from Dewey call number if it is present as a
1073      * suffix
1074      *
1075      * @param rawDeweyCallnum
1076      * @return call number without the volume information, or full call number
1077      *         if no volume information was present.
1078      */
1079     public static String removeDeweyVolSuffix(String rawDeweyCallnum) {
1080         String cutSuffix = getDeweyCutterSuffix(rawDeweyCallnum);
1081 
1082         if (cutSuffix == null || cutSuffix.length() == 0)
1083             return rawDeweyCallnum;
1084 
1085         Matcher matcher = VOL_PATTERN.matcher(cutSuffix);
1086         if (!matcher.find()) {
1087             matcher = VOL_PATTERN_LOOSER.matcher(cutSuffix);
1088             if (!matcher.find()) {
1089                 matcher = VOL_PATTERN_LETTERS.matcher(cutSuffix);
1090             }
1091         }
1092 
1093         if (matcher.find(0)) {
1094             // return orig call number with matcher part lopped off.
1095             int ix = rawDeweyCallnum.indexOf(cutSuffix) + matcher.start();
1096             if (ix != -1 && ix < rawDeweyCallnum.length()) {
1097                 return rawDeweyCallnum.substring(0, ix).trim();
1098             }
1099         }
1100         return rawDeweyCallnum;
1101     }
1102 
1103 
1104     /**
1105      * adds leading zeros to a dewey call number, when they're missing.
1106      *
1107      * @param deweyCallNum
1108      * @return the dewey call number with leading zeros
1109      */
1110     public static String addLeadingZeros(String deweyCallNum) {
1111         String result = deweyCallNum;
1112         String b4Cutter = getPortionBeforeCutter(deweyCallNum);
1113 
1114         // TODO: could call Utils.normalizeFloat(b4Cutter.trim(), 3, -1);
1115         // but still need to add back part after cutter
1116 
1117         String b4dec = null;
1118         int decIx = b4Cutter.indexOf(".");
1119         if (decIx >= 0)
1120             b4dec = deweyCallNum.substring(0, decIx).trim();
1121         else
1122             b4dec = b4Cutter.trim();
1123 
1124         if (b4dec != null) {
1125             switch (b4dec.length()) {
1126                 case 1:
1127                     result = "00" + deweyCallNum;
1128                     break;
1129                 case 2:
1130                     result = "0" + deweyCallNum;
1131             }
1132         }
1133 
1134         return result;
1135     }
1136 
1137     /**
1138      * return a format string corresponding to the number of digits specified
1139      *
1140      * @param numDigits - the number of characters the result should have (to be padded
1141      *                  with zeroes as necessary). A negative number means leave whatever digits
1142      *                  encountered as is; don't pad with zeroes -- up to 12 characters.
1143      */
1144     private static String getFormatString(int numDigits) {
1145         StringBuilder b4 = new StringBuilder();
1146         if (numDigits < 0)
1147             b4.append("############");
1148         else if (numDigits > 0) {
1149             for (int i = 0; i < numDigits; i++) {
1150                 b4.append('0');
1151             }
1152         }
1153         return b4.toString();
1154     }
1155 
1156     /**
1157      * @param callNumber
1158      * @return
1159      */
1160     public static String getSuDocShelfKey(String callNumber) {
1161         String upcaseSuDoccallnum = callNumber.toUpperCase();
1162         StringBuffer shelfKey = new StringBuffer();
1163         //split the call number based on numbers and alphabets
1164         String[] cNumSub = upcaseSuDoccallnum.split(SUDOC_REGEX);
1165         for (String str : cNumSub) {
1166             if (StringUtils.isNumeric(str)) {   // numbers
1167                 // append zeros to sort Ordinal
1168                 str = StringUtils.leftPad(str, 5, "0"); // constant length 5
1169                 shelfKey.append(str);
1170                 shelfKey.append(" ");
1171             } else {                     // alphabets
1172                 // append spaces to sort Lexicographic
1173                 str = StringUtils.rightPad(str, 5);  // constant length 5
1174                 shelfKey.append(str);
1175                 shelfKey.append(" ");
1176             }
1177         }
1178         return shelfKey.toString().trim();
1179     }
1180 }