/Files/rocketfan/Language_Independent_Extractive_Summarization.pdf /Files/rocketfan/Summarization_readme.pdf Well,当前仅仅做到将文本中的句子按重要程度排序,可进一步操作生成摘要,仅仅用于英文文本。 仅以该随笔纪念我的第一个JAVA程序:)
//usage //javac TestSummarizer.java //java TestSummarizer <inputFile> <outputFile> #output file 中将会对句子按重要程度排好序
1 /**
2 * The class for getting user command and 3 * parse the command to set up aprropritae 4 * Sumarizer who will finish the job of ranking the 5 * importance of all the input file sentences , 6 * sort sentences based on that and then 7 * write to the output file. 8 */ 9 public class TestSummarizer { 10 11 public static void main(String[] args) { 12 13 String rankMethod = args[0]; 14 String inputFile = args[1]; 15 String outputFile = args[2]; 16 //TODO warning not supported method 17 Ranker ranker; 18 if (rankMethod.equals("-pagerank")) { 19 ranker = new PageRankRanker(); 20 } else if (rankMethod.equals("-hits")) { 21 ranker = new HitsRanker(); 22 } else { 23 System.out.println("Not -pagerank or -hits use -pagerank"); 24 ranker = new PageRankRanker(); 25 } 26 Summarizer summarizer = new Summarizer(inputFile, outputFile, ranker); 27 summarizer.fileSummarize(); 28 } 29 } //end of class TestSumarizer
//Summarizer.java是核心部分,通过建立inverted word index 计算出所有句子对的overlap
1 //import java.util.regex.*;
2 3 import java.util.*; 4 5 import java.io.*; 6 7 import java.text.*; 8 9 //to do how to split also consider ? 10 11 12 13 /** 14 15 * Summarizer is on charge of caculating 16 17 * a Matrix A, with A[i][j] represents 18 19 * the overlap of sentence i and j. 20 21 * It will than send the Matrix A to a 22 23 * Ranker who will caculate all the sentence's 24 25 * importance value and send back to Summarizer. 26 27 * Summarizer will sort sentences based on 28 29 * their importance value and write the reult 30 31 * to file. 32 33 * 34 35 * Summarizer is also on charge of spliting doucument 36 37 * content to sentences and split each sentence to 38 39 * words. 40 41 */ 42 43 public class Summarizer { 44 45 46 47 /** 48 49 * Define Sentence type, it will record the sentence's 50 51 * start and end postion in fileContentString. 52 53 * Also will record the word number in this sentence. 54 55 */ 56 57 private class Sentence { 58 59 public int start; 60 61 public int end; 62 63 public int wordNum; 64 65 } 66 67 68 69 /** Store which rank aloritm to use*/ 70 71 private final Ranker ranker; 72 73 /** Store the name of output file*/ 74 75 private final String outputFileName; 76 77 /** The whole file content is stored as a string here*/ 78 79 private final String fileContentString; 80 81 /** sotre all the sentences info*/ 82 83 private final List<Sentence> sentenceList = new ArrayList<Sentence> ();; 84 85 /** the dfault stop words we will use*/ 86 87 private final static String stopWords[] = 88 89 {"a", "about", "an", "and", "are", "as", "at", "be", "but", "by", 90 91 "for", "has", "have", "he", "his", "her", "in", "is", "it", "its", 92 93 "of", "on", "or", "she", "that", "the", "their", "they", "this", 94 95 "to", "was", "which", "who", "will", "with", "you"}; 96 97 private final Map<String, String> hmStopWord = new HashMap<String, String> (); 98 99 100 101 /** 102 103 * Construct of a summarizer 104 105 * Client must provide the inputFileName to sumarize 106 107 * and the aimed outputFileName to output, 108 109 * Clent also has to choose wich ranker algorithm to use,eg. 110 111 * Summarizer will also get stop words hashing map table set up. 112 113 */ 114 115 public Summarizer(String inputFileName, String outputFileName, Ranker ranker) { 116 117 this.outputFileName = outputFileName; 118 119 this.fileContentString = readFileToString(inputFileName); 120 121 this.ranker = ranker; 122 123 initStopWordsMap(); 124 125 } 126 127 /** 128 129 * The only interface user can see currently, 130 131 * to sumarize a given file using ranker provided rank algorithm 132 133 */ 134 135 public void fileSummarize() { 136 137 initSentenceList(); //first spilit doucument to sentences using BreakIterator 138 139 double[][] A = getSetencesWeightMatrix(); //caculate the wight matrix using overlap score of sentences pair 140 141 double[]P = ranker.getRankVec(A); //caculate each sentence importance value 142 143 sortAndWriteFile(P); //sort and ouput the sorted sentences 144 145 } 146 147 148 149 //below are all internal private functions 150 151 152 153 /** 154 155 * Caculate the total sentece num and store each sentence 156 157 * info to sentencesList(start, end) 158 159 */ 160 161 private void initSentenceList() { 162 163 164 165 Locale currentLocale = new Locale("en", "US"); 166 167 BreakIterator sentenceIter = BreakIterator.getSentenceInstance(currentLocale); 168 169 sentenceIter.setText(fileContentString); 170 171 172 173 int start = sentenceIter.first(); 174 175 int end = sentenceIter.next(); 176 177 while (end != sentenceIter.DONE) { 178 179 Sentence sentence = new Sentence(); 180 181 sentence.start = start; 182 183 if (fileContentString.charAt(end - 1) == '\n') //we will eliminate the \n if a sentece has 184 185 sentence.end = end - 1; //[) 186 187 else 188 189 sentence.end = end; 190 191 sentenceList.add(sentence); 192 193 start = end; 194 195 end = sentenceIter.next(); 196 197 } 198 199 } 200 201 202 203 /** 204 205 * Init of stop words hash map 206 207 */ 208 209 private void initStopWordsMap() { 210 211 for (int i = 0; i < stopWords.length; i++) 212 213 hmStopWord.put(stopWords[i], null); 214 215 } 216 217 218 219 /** 220 221 * This is the key function of Summarizer 222 223 * It will caculate the overlap value 224 225 * for each pair of sentences 226 227 */ 228 229 private double[][] getSetencesWeightMatrix( ) { 230 231 232 233 Map<String, List<Integer> > hm = new HashMap<String, List<Integer> >(); 234 235 int sentenceNum = sentenceList.size(); 236 237 double[][] A = new double[sentenceNum][sentenceNum]; 238 239 //deal with all the words in each sentence 240 241 Locale currentLocale = new Locale("en", "US"); 242 243 BreakIterator wordIter = BreakIterator.getWordInstance(currentLocale); 244 245 246 247 //we do the process twice one forward and one reverse 248 249 //only for the case of repated words in one sentence 250 251 //we suppose overlap(i, j) = overlap(j, i) 252 253 //if setentce i is "above above ok" sentence j is "above above wrong" 254 255 //the overlap is 2 , if sentence j is "above wrong" the overlap is 1. 256 257 for (int i = 0; i < sentenceNum; i++) { 258 259 Sentence sentence = sentenceList.get(i); 260 261 wordIter.setText(fileContentString.substring(sentence.start, sentence.end)); 262 263 264 265 int start = wordIter.first(); 266 267 int end = wordIter.next(); 268 269 int wordNum = 0; 270 271 while (end != wordIter.DONE) { 272 273 String word = fileContentString.substring(sentence.start + start, sentence.start + end); 274 275 if (Character.isLetterOrDigit(word.charAt(0))) { 276 277 wordNum++; //ok, find legal word 278 279 280 281 String token = parseString(word); 282 283 if (hmStopWord.containsKey(token)) { 284 285 start = end; 286 287 end = wordIter.next(); 288 289 continue; 290 291 } 292 293 List<Integer> list; 294 295 if (hm.containsKey(token)) { 296 297 list = hm.get(token); 298 299 int flag = 0; 300 301 for (int j : list) { 302 303 if (i > j) 304 305 A[i][j] += 1.0; 306 307 else 308 309 flag = 1; 310 311 } 312 313 if (flag == 0) 314 315 list.add(i); 316 317 } else { 318 319 list = new LinkedList<Integer> (); 320 321 list.add(i); 322 323 hm.put(token, list); 324 325 } 326 327 } 328 329 start = end; 330 331 end = wordIter.next(); 332 333 } 334 335 sentence.wordNum = wordNum; 336 337 } 338 339 340 341 hm.clear(); 342 343 344 345 for (int i = sentenceNum - 1; i >= 0; i--) { 346 347 Sentence sentence = sentenceList.get(i); 348 349 wordIter.setText(fileContentString.substring(sentence.start, sentence.end)); 350 351 352 353 int start = wordIter.first(); 354 355 int end = wordIter.next(); 356 357 while (end != wordIter.DONE) { 358 359 String word = fileContentString.substring(sentence.start + start, sentence.start + end); 360 361 if (Character.isLetterOrDigit(word.charAt(0))) { 362 363 //ok,find a legal word 364 365 String token = parseString(word); 366 367 if (hmStopWord.containsKey(token)) { 368 369 start = end; 370 371 end = wordIter.next(); 372 373 continue; 374 375 } 376 377 List<Integer> list; 378 379 if (hm.containsKey(token)) { 380 381 list = hm.get(token); 382 383 int flag = 0; 384 385 for (int j : list) { 386 387 if (i < j) 388 389 A[i][j] += 1.0; 390 391 else 392 393 flag = 1; 394 395 } 396 397 if (flag == 0) 398 399 list.add(i); 400 401 402 403 } else { 404 405 list = new LinkedList<Integer> (); 406 407 list.add(i); 408 409 hm.put(token, list); 410 411 } 412 413 } 414 415 start = end; 416 417 end = wordIter.next(); 418 419 } 420 421 } 422 423 424 425 //make sure overlap(i, j) = overlap(j, i) when not considering length 426 427 for (int i = 0; i < sentenceNum; i++) { 428 429 for (int j = 0; j > i && j < sentenceNum; j++) { 430 431 if (A[i][j] > A[j][i]) 432 433 A[i][j] = A[j][i]; 434 435 } 436 437 } 438 439 440 441 //divide by sentence lenght to avoid promoting long sentences 442 443 for (int i = 0; i < sentenceNum; i++) { 444 445 for (int j = 0; j < sentenceNum; j++) { 446 447 if (A[i][j] > 0 && sentenceList.get(i).wordNum > 0) 448 449 A[i][j] /= sentenceList.get(i).wordNum; 450 451 } 452 453 } 454 455 456 457 return A; 458 459 } 460 461 462 463 /** Read the whole input file content*/ 464 465 private static String readFileToString(String infile) { 466 467 try { 468 469 BufferedReader reader = new BufferedReader(new FileReader(infile)); 470 471 String result =""; 472 473 String line = reader.readLine(); 474 475 while (line != null) { 476 477 result = result + line + '\n'; 478 479 line = reader.readLine(); 480 481 } 482 483 reader.close(); 484 485 return result; 486 487 } catch(IOException ex) { 488 489 System.out.println(ex); //handle file not found by displaying message 490 491 return ""; //return the empty string if file not fount 492 493 } 494 495 } 496 497 498 499 /** 500 501 * For each word we will consider it as lowercase 502 503 * and delete any non letter character. 504 505 */ 506 507 private static String parseString(String s) { 508 509 return s.toLowerCase().replaceAll("\\W", ""); 510 511 } 512 513 514 515 /** 516 517 * Print the sentence which has index i 518 519 */ 520 521 private String getSentenceString(int i) { 522 523 Sentence sentence = sentenceList.get(i); 524 525 return fileContentString.substring(sentence.start, sentence.end); 526 527 } 528 529 530 531 /** 532 533 * After running rank algorithm, sort the result and output the sentences based 534 535 * on their score 536 537 */ 538 539 class CollatorComparator implements Comparator<Double> { 540 541 public int compare(Double element1, Double element2) { 542 543 double key1 = element1.doubleValue(); 544 545 double key2 = element2.doubleValue(); 546 547 if (key1 == key2) 548 549 return 1; 550 551 if (key1 > key2) 552 553 return -1; 554 555 else 556 557 return 1; 558 559 } 560 561 } 562 563 564 565 private void sortAndWriteFile(double[] P) { 566 567 try { 568 569 CollatorComparator comparator = new CollatorComparator(); 570 571 Map<Double,Integer> map = new TreeMap<Double, Integer>(comparator); 572 573 574 575 for (int i = 0; i < P.length; i++) 576 577 map.put(P[i], i); 578 579 580 581 Set entries = map.entrySet();//得到关键字/值对的集合 582 583 Iterator iter = entries.iterator(); 584 585 PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(outputFileName))); 586 587 DecimalFormat df = new DecimalFormat("#0.00000"); 588 589 while(iter.hasNext()) { 590 591 Map.Entry entry = (Map.Entry)iter.next(); 592 593 double keyWeight = (Double)entry.getKey(); 594 595 int valueIndex = (Integer)entry.getValue(); 596 597 writer.write("("); 598 599 writer.write(df.format(keyWeight)); 600 601 writer.write(") "); 602 603 writer.write(getSentenceString(valueIndex) + "\n"); 604 605 } 606 607 writer.close(); 608 609 } catch (IOException ex) { 610 611 System.out.println(ex); //handle file not found by displaying message 612 613 } 614 615 } 616 617 618 } // end of class Sumarizer 619
//MatrixHelp.java 内部使用的一些矩阵操作相关函数的实现
1 /**
2 * This class has all the independent caculations related to matrix and vector 3 * that will be used in text sumarization. 4 * Notice those fuctions for this paticular project useage, does not consider 5 * more about reuse, the data type all use double and assume two dimessional 6 * matrix with same dinmenstion length(square matrix). 7 */ 8 public class MatrixHelp { 9 10 /** retrun squared distance of vector P1 and P2*/ 11 public static double distance(double[] P1, double[] P2) { 12 int len = P1.length; //P1.length must equal to P2.length 13 double sum = 0; 14 for (int i = 0; i < len; i++) 15 sum += (P1[i] - P2[i]) * (P1[i] - P2[i]); 16 return sum; 17 } 18 19 /** say H = transpose([a,b,c]) normal H means let H = transpose([a,b,c]/sqrt(a^2+b^2+c^2))*/ 20 public static void normal(double[] P) { 21 int len = P.length; 22 double sum = 0; 23 for (int i = 0; i < len; i++) 24 sum += P[i]; 25 for (int i = 0; i < len; i++) 26 P[i] /= sum; 27 } 28 29 /** return value of ||P||*/ 30 public static double normValue(double[] P) { 31 double sum = 0; 32 for (int i = 0; i < P.length; i++) 33 sum += Math.abs(P[i]); 34 return sum; 35 } 36 37 /** sub return P1 - P2*/ 38 public static double[] subVec(double[] P1, double[] P2) { 39 int len = P1.length; 40 double[] Result = new double[len]; 41 for (int i = 0; i < len; i++) 42 Result[i] = P1[i] - P2[i]; 43 return Result; 44 } 45 46 /** merge return (P1 + P2)/2.0*/ 47 public static double[] mergeVec(double[] P1, double[] P2) { 48 int len = P1.length; 49 double[] Result = new double[len]; 50 for (int i = 0; i < len; i++) 51 Result[i] = (P1[i] + P2[i])/2.0; 52 return Result; 53 } 54 55 56 /** normalize all the value in vec P to 0.001-0.0.9995*/ 57 public static void normalize(double[] P) { 58 double dMaxValue = -1.0; 59 double dMinValue = 2.0; 60 int len = P.length; 61 for (int i = 0; i < len; i++) { 62 if (P[i] < dMinValue) 63 dMinValue = P[i]; 64 if (P[i] > dMaxValue) 65 dMaxValue = P[i]; 66 } 67 if (dMaxValue - dMinValue < 0.0000001) 68 return; 69 for (int i = 0; i < P.length; i++) { 70 if(P[i] <= (dMinValue*1.005)) //do not use boundary value 71 P[i]= dMinValue*1.005; 72 if(P[i] >= (dMaxValue*0.995)) 73 P[i] = dMaxValue*0.995; 74 P[i]=(P[i]-dMinValue)/(dMaxValue-dMinValue); 75 P[i] = ((0.5-0.001)/0.5)*P[i] + 0.001; 76 } 77 } 78 /** return the result matrix of matrix A mutliply (transpose of A)*/ 79 public static double[][] matrixMultiply(double[][] A) { 80 int len = A[0].length; 81 double[][] Result = new double[len][len]; 82 for (int i = 0; i < len; i++) 83 for (int j = 0; j < len; j++) 84 for (int k = 0; k < len; k++) 85 Result[i][j] += A[i][k] * A[j][k]; //A[i][k] * AT[k][j] 86 return Result; 87 } 88 89 /** return the result matrix of matrix A mutliply matrix B,A and B of same dimention size*/ 90 public static double[][] matrixMultiply(double[][] A, double[][] B) { 91 int len = A[0].length; 92 double[][] Result = new double[len][len]; 93 for (int i = 0; i < len; i++) 94 for (int j = 0; j < len; j++) 95 for (int k = 0; j < len; k++) 96 Result[i][j] += A[i][k] * B[k][j]; 97 return Result; 98 } 99 /** return the result vector of matrix A mutliply vector P*/ 100 public static double[] matrixMultiplyVector(double[][] A, double[] P) { 101 int len = P.length; 102 double[] Result = new double[len]; 103 for (int i = 0; i < len; i++) 104 for (int j = 0; j < len; j++) 105 Result[i] += A[i][j] * P[j]; 106 return Result; 107 } 108 109 110 /** return the result vector of first transposing matrix A than mutliply vector P*/ 111 public static double[] matrixTransposeMultiplyVector(double[][] A, double[] P) { 112 int len = P.length; 113 double[] Result = new double[len]; 114 for (int i = 0; i < len; i++) 115 for (int j = 0; j < len; j++) { 116 Result[i] += A[j][i] * P[j]; 117 } 118 119 return Result; 120 } 121 122 /** for debug show all matrix data value*/ 123 public static void printMatrix(double[][] A) { 124 int len = A[0].length; 125 for (int i = 0; i < len; i++) { 126 for (int j = 0; j < len; j++) { 127 System.out.printf("%.2f", A[i][j]); 128 System.out.print(" "); 129 } 130 System.out.println(""); 131 } 132 } 133 134 /** for debug show all vector data value*/ 135 public static void printVector(double[] P) { 136 for (int i = 0; i < P.length; i++) 137 System.out.printf("%.2f", P[i]); 138 System.out.println("\n"); 139 } 140 } 141
//Ranker.java
1 /**
2 * Ranker an absturct class for calculating rank score of all sentences 3 * Right now there are two implementations 4 * PageRankRanker using pagerank algorithom and HitsRanker using hits 5 * algorithom. 6 */ 7 public interface Ranker { 8 /** 9 * Given the Weight matrix A 10 * return the vec needed after doing iterate algorithm,eg pagerank, hits. 11 */ 12 public double[] getRankVec(double[][] A); 13 }
//PageRankRanker.java //pager rank 算法的实现
1 /**
2 * The implementaion of Ranker based on pagerank algorithm. 3 */ 4 public class PageRankRanker implements Ranker { 5 6 public double[] getRankVec(double[][] A) { 7 return pageRankIterate(getInitMatrix(A), 0.85); 8 } 9 10 11 /** 12 * Kernal of page rank iterate 13 * Taking A as input matrixand d(damping factor,use 0.85 will be ok) 14 * and return the result vector with weight value 15 * for each sentence 16 */ 17 public static double[] pageRankIterate(double[][] A, double d) { 18 int len = A[0].length; 19 double[][] P= new double[2][len]; 20 int now = 0; 21 int other = 1; 22 int temp; 23 for (int i = 0; i < len; i++) 24 P[now][i] = 1.0/len; 25 26 double threshould = 0.0000001; //spell wrong? 27 do { 28 P[other] = MatrixHelp.matrixMultiplyVector(A, P[now]); 29 for (int i = 0; i < len; i++) 30 //P[other][i] = d * P[other][i] + (1 - d); 31 P[other][i] = d * P[other][i] + (1 - d); 32 temp = now; 33 now = other; 34 other = temp; 35 } while(MatrixHelp.distance(P[now], P[other]) > threshould); 36 37 MatrixHelp.normalize(P[now]); 38 return P[now]; 39 } 40 41 /** 42 * Get the init matrix for ranker to use based 43 * on input weight matrix A. 44 */ 45 private static double[][] getInitMatrix(double[][] A) { 46 int len = A[0].length; 47 double[][] B = new double[len][len]; 48 //for page rank init matrix 49 double weightSum; 50 for (int j = 0; j < len; j++) { 51 weightSum = 0.0; 52 for(int k = 0; k < len; k++) { 53 //weightSum += A[k][j]; 54 weightSum += A[j][k]; 55 } 56 for (int i = 0; i < len; i++) { 57 if (weightSum > 0.0000001) { 58 B[i][j] = A[i][j] / weightSum; 59 //B[i][j] = A[j][i] /weightSum; 60 } else if (i == j) { 61 B[i][j] = 0.0; 62 } else { 63 B[i][j] = 1.00/(len - 1); 64 } 65 } 66 } 67 return B; 68 } 69 70 } 71
//HitsRanker.java //hits算法的实现
1 /**
2 * The implementation of Ranker based on hits algorithm. 3 */ 4 public class HitsRanker implements Ranker { 5 6 public double[] getRankVec(double[][] A) { 7 return hitsIterate(A); 8 } 9 10 /** kernal of hits alogrithm*/ 11 public static double[] hitsIterate(double[][] B) { 12 int len = B[0].length; 13 double[][] L = MatrixHelp.matrixMultiply(B); //B*BT 14 15 double[][] H = new double[2][len]; //hub 16 double[][] A = new double[2][len]; //authority 17 int now = 0; 18 int other = 1; 19 int temp; 20 21 for (int i = 0; i < len; i++) { 22 H[now][i] = A[now][i] = 1.0; 23 } 24 double threshould = 0.0000001; 25 do { 26 A[other] = MatrixHelp.matrixTransposeMultiplyVector(L, A[now]); 27 H[other] = MatrixHelp.matrixMultiplyVector(L, H[now]); 28 MatrixHelp.normal(A[other]); 29 MatrixHelp.normal(H[other]); 30 //swap now and other 31 temp = now; 32 now = other; 33 other = temp; 34 }while(MatrixHelp.normValue(MatrixHelp.subVec(H[now], H[other])) > threshould && 35 MatrixHelp.normValue((MatrixHelp.subVec(A[now], A[other])))> threshould); 36 37 double[] R = MatrixHelp.mergeVec(A[now], H[now]); 38 MatrixHelp.normalize(R); 39 return R; 40 } 41 } 42
|
|