利用page rank,hits算法实现的一个简单的文本摘要系统

dinghj 2016-06-18

展开全文

/Files/rocketfan/Language_Independent_Extractive_Summarization.pdf

/Files/rocketfan/Summarization_readme.pdf

Well,当前仅仅做到将文本中的句子按重要程度排序，可进一步操作生成摘要，仅仅用于英文文本。

仅以该随笔纪念我的第一个JAVA程序:)

//usage

//javac TestSummarizer.java

//java TestSummarizer <inputFile> <outputFile> #output file 中将会对句子按重要程度排好序

1 /**
2  * The class for getting user command and
3  * parse the command to set up aprropritae
4  * Sumarizer who will finish the job of ranking the
5  * importance of all the input file sentences ,
6  * sort sentences based on that and then
7  * write to the output file.
8  */
9 public class TestSummarizer {
10
11     public static void main(String[] args) {
12
13         String rankMethod = args[0];
14         String inputFile = args[1];
15         String outputFile = args[2];
16         //TODO warning not supported method
17         Ranker ranker;
18         if (rankMethod.equals("-pagerank")) {
19             ranker = new PageRankRanker();
20         } else if (rankMethod.equals("-hits")) {
21             ranker = new HitsRanker();
22         } else {
23             System.out.println("Not -pagerank or -hits use -pagerank");
24             ranker = new PageRankRanker();
25         }
26         Summarizer summarizer = new Summarizer(inputFile, outputFile, ranker);
27         summarizer.fileSummarize();
28     }
29 } //end of class TestSumarizer

//Summarizer.java是核心部分，通过建立inverted word index 计算出所有句子对的overlap

  1 //import java.util.regex.*;
  2
  3 import java.util.*;
  4
  5 import java.io.*;
  6
  7 import java.text.*;
  8
  9 //to do how to split also consider ?
10
11
12
13 /**
14
15  * Summarizer is on charge of caculating
16
17  * a Matrix A, with A[i][j] represents
18
19  * the overlap of sentence i and j.
20
21  * It will than send the Matrix A to a
22
23  * Ranker who will caculate all the sentence's
24
25  * importance value and send back to Summarizer.
26
27  * Summarizer will sort sentences based on
28
29  * their importance value and write the reult
30
31  * to file.
32
33  *
34
35  * Summarizer is also on charge of spliting doucument
36
37  * content to sentences and split each sentence to
38
39  * words.
40
41  */
42
43 public class Summarizer {
44
45
46
47     /**
48
49      * Define Sentence type, it will record the sentence's
50
51      * start and end postion in fileContentString.
52
53      * Also will record the word number in this sentence.
54
55      */
56
57     private class Sentence {
58
59         public int start;
60
61         public int end;
62
63         public int wordNum;
64
65     }
66
67
68
69     /** Store which rank aloritm to use*/
70
71     private final Ranker ranker;
72
73     /** Store the name of output file*/
74
75     private final String outputFileName;
76
77     /** The whole file content is stored as a string here*/
78
79     private final String fileContentString;
80
81     /** sotre all the sentences info*/
82
83     private final List<Sentence> sentenceList =  new ArrayList<Sentence> ();;
84
85     /** the dfault stop words we will use*/
86
87     private final static String stopWords[] =
88
89         {"a", "about", "an", "and", "are", "as", "at", "be", "but", "by",
90
91          "for", "has", "have", "he", "his", "her", "in", "is", "it", "its",
92
93          "of", "on", "or", "she", "that", "the", "their", "they", "this",
94
95          "to", "was", "which", "who", "will", "with", "you"};
96
97     private final Map<String, String> hmStopWord = new HashMap<String, String> ();
98
99
100
101    /**
102
103     * Construct of a summarizer
104
105     * Client must provide the inputFileName to sumarize
106
107     * and the aimed outputFileName to output,
108
109     * Clent also has to choose wich ranker algorithm to use,eg.
110
111     * Summarizer will also get stop words hashing map table set up.
112
113     */
114
115     public Summarizer(String inputFileName, String outputFileName, Ranker ranker) {
116
117         this.outputFileName = outputFileName;
118
119         this.fileContentString = readFileToString(inputFileName);
120
121         this.ranker = ranker;
122
123         initStopWordsMap();
124
125     }
126
127     /**
128
129     * The only interface user can see currently,
130
131     * to sumarize a given file using ranker provided rank algorithm
132
133     */
134
135     public void fileSummarize() {
136
137         initSentenceList();  //first spilit doucument to sentences using BreakIterator
138
139         double[][] A = getSetencesWeightMatrix(); //caculate the wight matrix using overlap score of sentences pair
140
141         double[]P = ranker.getRankVec(A); //caculate each sentence importance value
142
143         sortAndWriteFile(P); //sort and ouput the sorted sentences
144
145     }
146
147
148
149     //below are all internal private functions
150
151
152
153     /**
154
155      * Caculate the total sentece num and store each sentence
156
157      * info to sentencesList(start, end)
158
159      */
160
161     private void initSentenceList() {
162
163
164
165         Locale currentLocale = new Locale("en", "US");
166
167         BreakIterator sentenceIter = BreakIterator.getSentenceInstance(currentLocale);
168
169         sentenceIter.setText(fileContentString);
170
171
172
173         int start = sentenceIter.first();
174
175         int end = sentenceIter.next();
176
177         while (end != sentenceIter.DONE) {
178
179             Sentence sentence = new Sentence();
180
181             sentence.start = start;
182
183             if (fileContentString.charAt(end - 1) == '\n') //we will eliminate the \n if a sentece has
184
185                 sentence.end = end - 1;     //[)
186
187             else
188
189                 sentence.end = end;
190
191             sentenceList.add(sentence);
192
193             start = end;
194
195             end = sentenceIter.next();
196
197         }
198
199     }
200
201
202
203     /**
204
205      * Init of stop words hash map
206
207      */
208
209     private void initStopWordsMap() {
210
211         for (int i = 0; i < stopWords.length; i++)
212
213             hmStopWord.put(stopWords[i], null);
214
215     }
216
217
218
219     /**
220
221      * This is the key function of Summarizer
222
223      * It will caculate the overlap value
224
225      * for each pair of sentences
226
227      */
228
229     private double[][] getSetencesWeightMatrix( ) {
230
231
232
233         Map<String, List<Integer> > hm = new HashMap<String, List<Integer> >();
234
235         int sentenceNum = sentenceList.size();
236
237         double[][] A = new double[sentenceNum][sentenceNum];
238
239         //deal with all the words in each sentence
240
241         Locale currentLocale = new Locale("en", "US");
242
243         BreakIterator wordIter = BreakIterator.getWordInstance(currentLocale);
244
245
246
247         //we do the process twice one forward and one reverse
248
249         //only for the case of repated words in one sentence
250
251         //we suppose overlap(i, j) = overlap(j, i)
252
253         //if setentce i is "above above ok" sentence j is "above above wrong"
254
255         //the overlap is 2 , if sentence j is "above wrong" the overlap is 1.
256
257         for (int i = 0; i < sentenceNum; i++) {
258
259             Sentence sentence = sentenceList.get(i);
260
261             wordIter.setText(fileContentString.substring(sentence.start, sentence.end));
262
263
264
265             int start = wordIter.first();
266
267             int end = wordIter.next();
268
269             int wordNum = 0;
270
271             while (end != wordIter.DONE) {
272
273                 String word = fileContentString.substring(sentence.start + start, sentence.start + end);
274
275                 if (Character.isLetterOrDigit(word.charAt(0))) {
276
277                     wordNum++; //ok, find legal word
278
279
280
281                     String token = parseString(word);
282
283                     if (hmStopWord.containsKey(token)) {
284
285                         start = end;
286
287                         end = wordIter.next();
288
289                         continue;
290
291                     }
292
293                     List<Integer> list;
294
295                     if (hm.containsKey(token)) {
296
297                         list = hm.get(token);
298
299                         int flag = 0;
300
301                         for (int j : list) {
302
303                             if (i > j)
304
305                                 A[i][j] += 1.0;
306
307                             else
308
309                                 flag = 1;
310
311                         }
312
313                         if (flag == 0)
314
315                             list.add(i);
316
317                     } else {
318
319                         list = new LinkedList<Integer> ();
320
321                         list.add(i);
322
323                         hm.put(token, list);
324
325                     }
326
327                 }
328
329                 start = end;
330
331                 end = wordIter.next();
332
333             }
334
335             sentence.wordNum = wordNum;
336
337         }
338
339
340
341          hm.clear();
342
343
344
345         for (int i = sentenceNum - 1; i >= 0; i--) {
346
347             Sentence sentence = sentenceList.get(i);
348
349             wordIter.setText(fileContentString.substring(sentence.start, sentence.end));
350
351
352
353             int start = wordIter.first();
354
355             int end = wordIter.next();
356
357             while (end != wordIter.DONE) {
358
359                 String word = fileContentString.substring(sentence.start + start, sentence.start + end);
360
361                 if (Character.isLetterOrDigit(word.charAt(0))) {
362
363                     //ok,find a legal word
364
365                     String token = parseString(word);
366
367                     if (hmStopWord.containsKey(token)) {
368
369                         start = end;
370
371                         end = wordIter.next();
372
373                         continue;
374
375                     }
376
377                     List<Integer> list;
378
379                     if (hm.containsKey(token)) {
380
381                         list = hm.get(token);
382
383                         int flag = 0;
384
385                         for (int j : list) {
386
387                             if (i < j)
388
389                                 A[i][j] += 1.0;
390
391                             else
392
393                                 flag = 1;
394
395                         }
396
397                         if (flag == 0)
398
399                             list.add(i);
400
401
402
403                     } else {
404
405                         list = new LinkedList<Integer> ();
406
407                         list.add(i);
408
409                         hm.put(token, list);
410
411                     }
412
413                 }
414
415                 start = end;
416
417                 end = wordIter.next();
418
419             }
420
421         }
422
423
424
425         //make sure overlap(i, j) = overlap(j, i) when not considering length
426
427         for (int i = 0; i < sentenceNum; i++) {
428
429             for (int j = 0; j > i && j < sentenceNum; j++) {
430
431                 if (A[i][j] > A[j][i])
432
433                     A[i][j] = A[j][i];
434
435             }
436
437         }
438
439
440
441         //divide by sentence lenght to avoid promoting long sentences
442
443         for (int i = 0; i < sentenceNum; i++) {
444
445             for (int j = 0; j < sentenceNum; j++) {
446
447                 if (A[i][j] > 0 && sentenceList.get(i).wordNum > 0)
448
449                     A[i][j] /= sentenceList.get(i).wordNum;
450
451             }
452
453         }
454
455
456
457         return A;
458
459     }
460
461
462
463     /** Read the whole input file content*/
464
465     private static String readFileToString(String infile) {
466
467         try {
468
469             BufferedReader reader = new BufferedReader(new FileReader(infile));
470
471             String result ="";
472
473             String line = reader.readLine();
474
475             while (line != null) {
476
477                 result = result + line + '\n';
478
479                 line = reader.readLine();
480
481             }
482
483             reader.close();
484
485             return result;
486
487         } catch(IOException ex) {
488
489             System.out.println(ex); //handle file not found by displaying message
490
491             return ""; //return the empty string if file not fount
492
493         }
494
495     }
496
497
498
499     /**
500
501      * For each word we will consider it as lowercase
502
503      * and delete any non letter character.
504
505      */
506
507     private static String parseString(String s) {
508
509         return s.toLowerCase().replaceAll("\\W", "");
510
511     }
512
513
514
515     /**
516
517      * Print the sentence which has index i
518
519      */
520
521     private String getSentenceString(int i) {
522
523         Sentence sentence = sentenceList.get(i);
524
525         return fileContentString.substring(sentence.start, sentence.end);
526
527     }
528
529
530
531     /**
532
533      * After running rank algorithm, sort the result and output the sentences based
534
535      * on their score
536
537      */
538
539     class CollatorComparator implements Comparator<Double> {
540
541         public int compare(Double element1, Double element2) {
542
543             double key1 = element1.doubleValue();
544
545             double key2 = element2.doubleValue();
546
547             if (key1 == key2)
548
549                 return 1;
550
551             if (key1 > key2)
552
553                 return -1;
554
555             else
556
557                 return 1;
558
559         }
560
561     }
562
563
564
565     private void sortAndWriteFile(double[] P) {
566
567         try {
568
569             CollatorComparator comparator = new CollatorComparator();
570
571             Map<Double,Integer> map = new TreeMap<Double, Integer>(comparator);
572
573
574
575             for (int i = 0; i < P.length; i++)
576
577                 map.put(P[i], i);
578
579
580
581             Set entries = map.entrySet();//得到关键字/值对的集合
582
583             Iterator iter = entries.iterator();
584
585             PrintWriter writer = new PrintWriter(new BufferedWriter(new FileWriter(outputFileName)));
586
587             DecimalFormat df = new DecimalFormat("#0.00000");
588
589             while(iter.hasNext()) {
590
591                 Map.Entry entry = (Map.Entry)iter.next();
592
593                 double keyWeight = (Double)entry.getKey();
594
595                 int valueIndex = (Integer)entry.getValue();
596
597                 writer.write("(");
598
599                 writer.write(df.format(keyWeight));
600
601                 writer.write(") ");
602
603                 writer.write(getSentenceString(valueIndex) + "\n");
604
605             }
606
607             writer.close();
608
609         } catch (IOException ex) {
610
611             System.out.println(ex); //handle file not found by displaying message
612
613         }
614
615     }
616
617
618 } // end of class Sumarizer
619

//MatrixHelp.java 内部使用的一些矩阵操作相关函数的实现

  1 /**
  2  * This class has all the independent caculations related to matrix and vector
  3  * that will be used in text sumarization.
  4  * Notice those fuctions for this paticular project useage, does not consider
  5  * more about reuse, the data type all use double and assume two dimessional
  6  * matrix with same dinmenstion length(square matrix).
  7  */
  8 public class MatrixHelp {
  9
10     /** retrun squared distance of vector P1 and P2*/
11     public static double distance(double[] P1, double[] P2) {
12         int len = P1.length;  //P1.length must equal to P2.length
13         double sum = 0;
14         for (int i = 0; i < len; i++)
15             sum += (P1[i] - P2[i]) * (P1[i] - P2[i]);
16         return sum;
17     }
18
19     /** say H = transpose([a,b,c]) normal H means let H = transpose([a,b,c]/sqrt(a^2+b^2+c^2))*/
20     public static void normal(double[] P) {
21         int len = P.length;
22         double sum = 0;
23         for (int i = 0; i < len; i++)
24             sum += P[i];
25         for (int i = 0; i < len; i++)
26             P[i] /= sum;
27     }
28
29     /** return value of ||P||*/
30     public static double normValue(double[] P) {
31         double sum = 0;
32         for (int i = 0; i < P.length; i++)
33             sum += Math.abs(P[i]);
34         return sum;
35     }
36
37     /** sub return P1 - P2*/
38     public static double[] subVec(double[] P1, double[] P2) {
39         int len = P1.length;
40         double[] Result = new double[len];
41         for (int i = 0; i < len; i++)
42             Result[i] = P1[i] - P2[i];
43         return Result;
44     }
45
46     /** merge return (P1 + P2)/2.0*/
47     public static double[] mergeVec(double[] P1, double[] P2) {
48         int len = P1.length;
49         double[] Result = new double[len];
50         for (int i = 0; i < len; i++)
51             Result[i] = (P1[i] + P2[i])/2.0;
52         return Result;
53     }
54
55
56     /** normalize all the value in vec P to 0.001-0.0.9995*/
57     public static void normalize(double[] P) {
58         double dMaxValue = -1.0;
59         double dMinValue = 2.0;
60         int len = P.length;
61         for (int i = 0; i < len; i++) {
62             if (P[i] < dMinValue)
63                 dMinValue = P[i];
64             if (P[i] > dMaxValue)
65                 dMaxValue = P[i];
66         }
67         if (dMaxValue - dMinValue < 0.0000001)
68             return;
69         for (int i = 0; i < P.length; i++) {
70             if(P[i] <= (dMinValue*1.005)) //do not use boundary value
71                 P[i]= dMinValue*1.005;
72             if(P[i] >= (dMaxValue*0.995))
73                 P[i] = dMaxValue*0.995;
74             P[i]=(P[i]-dMinValue)/(dMaxValue-dMinValue);
75             P[i] = ((0.5-0.001)/0.5)*P[i] + 0.001;
76         }
77     }
78     /** return the result matrix of matrix A mutliply (transpose of A)*/
79     public static double[][] matrixMultiply(double[][] A) {
80         int len = A[0].length;
81         double[][] Result = new double[len][len];
82         for (int i = 0; i < len; i++)
83             for (int j = 0; j < len; j++)
84                 for (int k = 0; k < len; k++)
85                     Result[i][j] += A[i][k] * A[j][k]; //A[i][k] * AT[k][j]
86         return Result;
87     }
88
89     /** return the result matrix of matrix A mutliply matrix B,A and B of same dimention size*/
90     public static double[][] matrixMultiply(double[][] A, double[][] B) {
91         int len = A[0].length;
92         double[][] Result = new double[len][len];
93         for (int i = 0; i < len; i++)
94             for (int j = 0; j < len; j++)
95                 for (int k = 0; j < len; k++)
96                     Result[i][j] += A[i][k] * B[k][j];
97         return Result;
98     }
99     /** return the result vector of matrix A mutliply vector P*/
100     public static double[] matrixMultiplyVector(double[][] A, double[] P) {
101         int len = P.length;
102         double[] Result = new double[len];
103         for (int i = 0; i < len; i++)
104             for (int j = 0; j < len; j++)
105                 Result[i] += A[i][j] * P[j];
106         return Result;
107     }
108
109
110     /** return the result vector  of first transposing matrix A than mutliply vector P*/
111     public static double[] matrixTransposeMultiplyVector(double[][] A, double[] P) {
112         int len = P.length;
113         double[] Result = new double[len];
114         for (int i = 0; i < len; i++)
115             for (int j = 0; j < len; j++) {
116                 Result[i] += A[j][i] * P[j];
117             }
118
119         return Result;
120     }
121
122     /** for debug show all matrix data value*/
123     public static void printMatrix(double[][] A) {
124         int len = A[0].length;
125         for (int i = 0; i < len; i++) {
126             for (int j = 0; j < len; j++) {
127                 System.out.printf("%.2f", A[i][j]);
128                 System.out.print(" ");
129             }
130             System.out.println("");
131         }
132     }
133
134     /** for debug show all vector data value*/
135     public static void printVector(double[] P) {
136         for (int i = 0; i < P.length; i++)
137              System.out.printf("%.2f", P[i]);
138         System.out.println("\n");
139     }
140 }
141

//Ranker.java

1 /**
2  * Ranker an absturct class for calculating rank score of all sentences
3  * Right now there are two implementations
4  * PageRankRanker using pagerank algorithom and HitsRanker using hits
5  * algorithom.
6  */
7 public interface Ranker {
8     /**
9      * Given the Weight matrix A
10      * return the vec needed after doing iterate algorithm,eg pagerank, hits

.
11 */
12 public double[] getRankVec(double[][] A);
13 }

//PageRankRanker.java

//pager rank 算法的实现

1 /**
2  * The implementaion of Ranker based on pagerank algorithm.
3  */
4 public class PageRankRanker implements Ranker {
5
6     public double[] getRankVec(double[][] A) {
7         return  pageRankIterate(getInitMatrix(A), 0.85);
8     }
9
10
11     /**
12      * Kernal of page rank iterate
13      * Taking A as input matrixand d(damping factor,use 0.85 will be ok)
14      * and return the result vector with weight value
15      * for each sentence
16      */
17     public static double[] pageRankIterate(double[][] A, double d) {
18         int len = A[0].length;
19         double[][] P= new double[2][len];
20         int now = 0;
21         int other = 1;
22         int temp;
23         for (int i = 0; i < len; i++)
24             P[now][i] = 1.0/len;
25
26         double threshould = 0.0000001;  //spell wrong?
27         do {
28             P[other] = MatrixHelp.matrixMultiplyVector(A, P[now]);
29             for (int i = 0; i < len; i++)
30                 //P[other][i] = d * P[other][i] + (1 - d);
31                 P[other][i] = d * P[other][i] + (1 - d);
32             temp = now;
33             now = other;
34             other = temp;
35         } while(MatrixHelp.distance(P[now], P[other]) > threshould);
36
37         MatrixHelp.normalize(P[now]);
38         return P[now];
39     }
40
41    /**
42     * Get the init matrix for ranker to use based
43     * on input weight matrix A.
44     */
45     private static double[][] getInitMatrix(double[][] A) {
46         int len = A[0].length;
47         double[][] B = new double[len][len];
48         //for page rank init matrix
49         double weightSum;
50         for (int j = 0; j < len; j++) {
51             weightSum = 0.0;
52             for(int k = 0; k < len; k++) {
53                 //weightSum += A[k][j];
54                 weightSum += A[j][k];
55             }
56             for (int i = 0; i < len; i++) {
57                 if (weightSum > 0.0000001) {
58                     B[i][j] = A[i][j] / weightSum;
59                     //B[i][j] = A[j][i] /weightSum;
60                 } else if (i == j) {
61                     B[i][j] = 0.0;
62                 } else {
63                     B[i][j] = 1.00/(len - 1);
64                 }
65             }
66         }
67         return B;
68     }
69
70 }
71

//HitsRanker.java

//hits算法的实现

1 /**
2  * The implementation of Ranker based on hits algorithm.
3  */
4 public class HitsRanker implements Ranker {
5
6     public double[] getRankVec(double[][] A) {
7         return hitsIterate(A);
8     }
9
10     /** kernal of hits alogrithm*/
11     public static double[] hitsIterate(double[][] B) {
12         int len = B[0].length;
13         double[][] L = MatrixHelp.matrixMultiply(B); //B*BT
14
15         double[][] H = new double[2][len]; //hub
16         double[][] A = new double[2][len]; //authority
17         int now = 0;
18         int other = 1;
19         int temp;
20
21         for (int i = 0; i < len; i++) {
22             H[now][i] = A[now][i] = 1.0;
23         }
24         double threshould = 0.0000001;
25         do {
26             A[other] = MatrixHelp.matrixTransposeMultiplyVector(L, A[now]);
27             H[other] = MatrixHelp.matrixMultiplyVector(L, H[now]);
28             MatrixHelp.normal(A[other]);
29             MatrixHelp.normal(H[other]);
30             //swap now and other
31             temp = now;
32             now = other;
33             other = temp;
34         }while(MatrixHelp.normValue(MatrixHelp.subVec(H[now], H[other])) > threshould &&
35                MatrixHelp.normValue((MatrixHelp.subVec(A[now], A[other])))> threshould);
36
37         double[] R = MatrixHelp.mergeVec(A[now], H[now]);
38         MatrixHelp.normalize(R);
39         return R;
40     }
41 }
42