KickJava   Java API By Example, From Geeks To Geeks.

Java > Open Source Codes > org > mmbase > util > transformers > XmlField


1 package org.mmbase.util.transformers;
2
3 import java.io.*;
4 import java.util.HashMap JavaDoc;
5 import java.util.Map JavaDoc;
6
7 import javax.xml.transform.stream.StreamSource JavaDoc;
8 import javax.xml.transform.stream.StreamResult JavaDoc;
9
10 import java.util.regex.*;
11
12 import org.mmbase.util.StringObject;
13 import org.mmbase.util.ResourceLoader;
14 import org.mmbase.util.XSLTransformer;
15
16 import org.mmbase.util.logging.Logger;
17 import org.mmbase.util.logging.Logging;
18
19 /**
20  * XMLFields in MMBase. This class can encode such a field to several other formats.
21  *
22  * @author Michiel Meeuwissen
23  * @version $Id: XmlField.java,v 1.46 2006/04/10 13:34:19 pierre Exp $
24  * @todo THIS CLASS NEEDS A CONCEPT! It gets a bit messy.
25  */

26
27 public class XmlField extends ConfigurableStringTransformer implements CharTransformer {
28
29     private static final Logger log = Logging.getLoggerInstance(XmlField.class);
30
31     // can be decoded:
32
public final static int POORBODY = 5;
33     public final static int RICHBODY = 6;
34
35     // cannot yet be encoded even..
36
public final static int HTML_INLINE = 7;
37     public final static int HTML_BLOCK = 8;
38     public final static int HTML_BLOCK_BR = 9;
39     public final static int HTML_BLOCK_NOSURROUNDINGP = 10;
40     public final static int HTML_BLOCK_BR_NOSURROUNDINGP = 11;
41     public final static int HTML_BLOCK_LIST = 12;
42     public final static int HTML_BLOCK_LIST_BR = 13;
43     public final static int HTML_BLOCK_LIST_NOSURROUNDINGP = 14;
44     public final static int HTML_BLOCK_LIST_BR_NOSURROUNDINGP = 15;
45
46     // cannot be decoded:
47
public final static int ASCII = 51;
48     public final static int XHTML = 52;
49
50     private final static String JavaDoc CODING = "UTF-8"; // This class only support UTF-8 now.
51

52
53
54     private static boolean isListChar(char c) {
55         return c == '-' || c == '*';
56     }
57     private static String JavaDoc listTag(char c) {
58         return c == '-' ? "ul" : "ol";
59     }
60
61     /**
62      * Takes a string object, finds list structures and changes those to XML
63      */

64     static void handleList(StringObject obj) {
65         // handle lists
66
// make <ul> possible (not yet nested), with -'s on the first char of line.
67
int inList = 0; //
68
int pos = 0;
69         if (obj.length() < 3) {
70             return;
71         }
72         char listChar = '-';
73         if (isListChar(obj.charAt(0)) && !isListChar(obj.charAt(1))) { // hoo, we even _start_ with a list;
74
listChar = obj.charAt(0);
75             obj.insert(0, "\n"); // in the loop \n- is deleted, so it must be there.
76
} else {
77             while (true) {
78                 int pos1 = obj.indexOf("\n-", pos); // search the first
79
int pos2 = obj.indexOf("\n*", pos); // search the first
80

81                 pos = (pos1 > 0 && pos1 < pos2) || pos2 < 0 ? pos1 : pos2;
82                 if (pos == -1 || obj.length() <= pos + 2) break;
83                 if (! isListChar(obj.charAt(pos + 2))) {
84                     listChar = obj.charAt(pos + 1);
85                     break;
86                 }
87                 pos += 2;
88             }
89         }
90
91         listwhile : while (pos != -1) {
92             if (inList == 0) { // not yet in list
93
inList++; // now we are
94
obj.delete(pos, 2); // delete \n-
95
// remove spaces..
96
while (pos < obj.length() && obj.charAt(pos) == ' ') {
97                     obj.delete(pos, 1);
98                 }
99                 if (pos > 0) {
100                     obj.insert(pos, "\n");
101                     pos += 1;
102                 }
103                 obj.insert(pos, "<" + listTag(listChar) + ">\r<li>"); // insert 9 chars.
104
pos += 9;
105
106             } else { // already in list
107
if (obj.charAt(pos + 1) != listChar) { // end of list
108
obj.delete(pos, 1); // delete \n
109
obj.insert(pos, "</li>\r</" + listTag(listChar) + ">\n");
110                     pos += 12;
111                     inList--;
112                 } else { // not yet end
113
obj.delete(pos, 2); // delete \n-
114
// remove spaces..
115
while (pos < obj.length() && obj.charAt(pos) == ' ')
116                         obj.delete(pos, 1);
117                     obj.insert(pos, "</li>\r<li>");
118                     pos += 10;
119                 }
120             }
121             if (inList > 0) { // search for new line
122
pos = obj.indexOf("\n", pos);
123                 if (pos == -1)
124                     break; // no new line found? End of list, of text.
125
if (pos + 1 == obj.length()) {
126                     obj.delete(pos, 1);
127                     break; // if end of text, simply remove the newline.
128
}
129                 while (obj.charAt(pos + 1) == ' ') {
130                     // if next line starts with space, this new line does not count. This makes it possible to have some formatting in a <li>
131
pos = obj.indexOf("\n", pos + 1);
132                     if (pos + 1 == obj.length()) {
133                         obj.delete(pos, 1);
134                         break listwhile; // nothing to do...
135
}
136                 }
137             } else { // search for next item
138
while (true) {
139                     int pos1 = obj.indexOf("\n-", pos);
140                     int pos2 = obj.indexOf("\n*", pos);
141
142                     pos = (pos1 > 0 && pos1 < pos2) || pos2 < 0 ? pos1 : pos2;
143                     if (pos == -1 || obj.length() <= pos + 2) break;
144                     if (! isListChar(obj.charAt(pos + 2))) {
145                         listChar = obj.charAt(pos + 1);
146                         break; // should not start with two -'s, because this is some seperation line
147
}
148                     pos += 2;
149                 }
150             }
151         }
152         // make sure that the list is closed:
153
while (inList > 0) { // lists in lists not already supported, but if we will...
154
obj.insert(obj.length(), "</li></" + listTag(listChar) + ">\n");
155             inList--; // always finish with a new line, it might be needed for the finding of paragraphs.
156
}
157
158     }
159     /**
160      * If you want to add a _ in your text, that should be possible too...
161      * Should be done last, because no tags can appear in <em>
162
163      * @param ch This is '_' or e.g. '*'
164      * @param tag The tag to produce, e.g. "em" or "strong"
165      */

166     // test cases:
167
// I cite _m_pos_! -> <mmxf><p>I cite <em>m_pos</em>!</p></mmxf>
168

169     static void handleEmph(StringObject obj, char ch, String JavaDoc tag) {
170
171         obj.replace("" + ch + ch, "&#95;"); // makes it possible to escape underscores (or what you choose)
172

173         // Emphasizing. This is perhaps also asking for trouble, because
174
// people will try to use it like <font> or other evil
175
// things. But basicly emphasizion is content, isn't it?
176

177         String JavaDoc sch = "" + ch;
178
179         int posEmphOpen = obj.indexOf(sch, 0);
180         int posTagOpen = obj.indexOf("<", 0); // must be closed before next tag opens.
181

182
183         OUTER:
184         while (posEmphOpen != -1) {
185
186             if (posTagOpen > 0 &&
187                 posTagOpen < posEmphOpen) { // ensure that we are not inside existing tags
188
int posTagClose = obj.indexOf(">", posTagOpen);
189                 if (posTagClose == -1) break;
190                 posEmphOpen = obj.indexOf(sch, posTagClose);
191                 posTagOpen = obj.indexOf("<", posTagClose);
192                 continue;
193             }
194
195             if (posEmphOpen + 1 >= obj.length()) break; // no use, nothing can follow
196

197             if ((posEmphOpen > 0 && Character.isLetterOrDigit(obj.charAt(posEmphOpen - 1))) ||
198                 (! Character.isLetterOrDigit(obj.charAt(posEmphOpen + 1)))) {
199                 // _ is inside a word, ignore that.
200
// or not starting a word
201
posEmphOpen = obj.indexOf(sch, posEmphOpen + 1);
202                 continue;
203             }
204
205             // now find closing _.
206
int posEmphClose = obj.indexOf(sch, posEmphOpen + 1);
207             if (posEmphClose == -1) break;
208             while((posEmphClose + 1) < obj.length() &&
209                   (Character.isLetterOrDigit(obj.charAt(posEmphClose + 1)))
210                   ) {
211                 posEmphClose = obj.indexOf(sch, posEmphClose + 1);
212                 if (posEmphClose == -1) break OUTER;
213             }
214
215             if (posTagOpen > 0
216                 && posEmphClose > posTagOpen) {
217                 posEmphOpen = obj.indexOf(sch, posTagOpen); // a tag opened before emphasis close, ignore then too, and re-search
218
continue;
219             }
220
221             // realy do replacing now
222
obj.delete(posEmphClose, 1);
223             obj.insert(posEmphClose,"</" + tag + ">");
224             obj.delete(posEmphOpen, 1);
225             obj.insert(posEmphOpen, "<" + tag + ">");
226             posEmphClose += 7;
227
228             posEmphOpen = obj.indexOf(sch, posEmphClose);
229             posTagOpen = obj.indexOf("<", posEmphClose);
230
231         }
232
233         obj.replace("&#95;", sch);
234     }
235
236     /**
237      * Some paragraphs are are really \sections. So this handler can
238      * be done after handleParagraphs. It will search the paragraphs
239      * which are really headers, and changes them. A header, in our
240      * 'rich' text format, is a paragraph starting with one or more $.
241      * If there are more then one, the resulting <section> tags are
242      * going to be nested.
243      *
244      */

245     static void handleHeaders(StringObject obj) {
246         // handle headers
247
int requested_level;
248         char ch;
249         int level = 0; // start without being in section.
250
int pos = obj.indexOf("<p>$", 0);
251         OUTER:
252         while (pos != -1) {
253             obj.delete(pos, 4); // remove <p>$
254

255             requested_level = 1;
256             // find requested level:
257
while (true) {
258                 ch = obj.charAt(pos);
259                 if (ch == '$') {
260                     requested_level++;
261                     obj.delete(pos, 1);
262                 } else {
263                     if (ch == ' ') {
264                         obj.delete(pos, 1);
265                     }
266                     break;
267                 }
268             }
269             StringBuffer JavaDoc add = new StringBuffer JavaDoc();
270             for (; requested_level <= level; level--) {
271                 // same or higher level section
272
add.append("</section>");
273             }
274             level++;
275             for (; requested_level > level; level++) {
276                 add.append("<section>");
277             }
278             add.append("<section><h>");
279
280             obj.insert(pos, add.toString());
281             pos += add.length();
282
283             // search end title of header;
284

285             while (true) { // oh yes, and don't allow _ in title.
286
int pos1 = obj.indexOf("_", pos);
287                 int posP = obj.indexOf("</p>", pos);
288                 int posNl = obj.indexOf("\n", pos);
289                 int delete;
290                 int pos2;
291                 if ((posP > 0 && posP < posNl) || posNl == -1) {
292                     pos2 = posP;
293                     delete = 4;
294                 } else {
295                     pos2 = posNl;
296                     delete = 1;
297                 }
298                 if (pos1 < pos2 && pos1 > 0) {
299                     obj.delete(pos1, 1);
300                 } else {
301                     pos = pos2;
302                     if (pos == -1) {
303                         break OUTER; // not found, could not happen.
304
}
305                     obj.delete(pos, delete);
306                     obj.insert(pos, "</h>");
307                     pos += 4;
308                     if (delete == 1) {
309                         obj.insert(pos, "<p>");
310                         pos += 3;
311                     }
312                     break;
313                 }
314             }
315             pos = obj.indexOf("<p>$", pos); // search the next one.
316
}
317         // ready, close all sections still open.
318
for (; level > 0; level--) {
319             obj.insert(obj.length(), "</section>");
320         }
321
322     }
323
324     // check if on that position the string object contains a <ul> or <ol>
325
static private boolean containsListTag(StringObject obj, int pos) {
326         return obj.length() > pos + 4 &&
327                obj.charAt(pos) == '<' &&
328                (obj.charAt(pos+1) == 'u' || obj.charAt(pos+1) == 'o') &&
329                obj.charAt(pos+2) == 'l' &&
330                obj.charAt(pos+3) == '>';
331     }
332
333     /**
334      * Make <p> </p> tags.
335      * @param leaveExtraNewLines (defaults to false) if false, 2 or more newlines starts a new p. If true, every 2 newlines starts new p, and every extra new line simply stays (inside the p).
336      * @param surroundingP (defaults to true) wether the surrounding &lt;p&gt; should be included too.
337      */

338     static void handleParagraphs(StringObject obj, boolean leaveExtraNewLines, boolean surroundingP) {
339         handleParagraphs(obj, leaveExtraNewLines, surroundingP, false);
340     }
341
342     /**
343      * Make &lt;p> &lt;/p> tags.
344      * Note that if placeListsInsideP is <code>false</code>, the code generated with lists becomes akin to:
345      * &lt;p&gt;...&lt;/p&gt;&lt;ul&gt;...&lt;/ul&gt;&lt;p&gt;...&lt;/p&gt;
346      *
347      * If placeListsInsideP is <code>true</code>, the code becomes:
348      * &lt;p&gt;...&lt;ul&gt;...&lt;/ul&gt;...&lt;/p&gt;
349      *
350      * If there is no content in front of the first list, or after the last list, those paragraphs are empty and may not be
351      * added.
352      *
353      * @param leaveExtraNewLines (defaults to false) if false, 2 or more newlines starts a new p. If true, every 2 newlines starts new p, and every extra new line simply stays (inside the p).
354      * @param surroundingP (defaults to true) wether the surrounding &lt;p&gt; should be included too.
355      * @param placeListsInsideP (defaults to false) wether a list should be placed inside a &lt;p&gt; (as allowed by xhtml2).
356      */

357     static void handleParagraphs(StringObject obj, boolean leaveExtraNewLines, boolean surroundingP, boolean placeListsInsideP) {
358         // handle paragraphs:
359
boolean inParagraph = true;
360         int pos = 0;
361         // we should actually test if the first bit is a list, and if so, skip it
362
if (surroundingP) {
363             if (!placeListsInsideP && containsListTag(obj,pos)) {
364                 //note: this does not take into account nested lists
365
int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)+ "l>", pos + 1);
366                 // only continue this if this is a balanced list
367
if (posEnd != -1) {
368                     pos = posEnd +5;
369                     if (obj.length() > pos && obj.charAt(pos) == '\n') {
370                         obj.delete(pos, 1);
371                     }
372                     if (pos >= obj.length()) {
373                         return;
374                     }
375                 }
376             }
377             obj.insert(pos, "<p>");
378             pos += 3;
379         } else {
380             // if the code starts with a list, and it should be placed outside a paragraph,
381
// add a \n to make sure that the list is parsed
382
if (!placeListsInsideP && containsListTag(obj,pos)) {
383                 obj.insert(pos, "\n");
384             }
385         }
386         boolean start = true;
387         while (pos < obj.length()) {
388             // one or more empty lines.
389
if (start) {
390                 start = false;
391                 pos = obj.indexOf("\n", pos);
392             } else {
393                 pos = obj.indexOf("\n", pos + 1);
394             }
395             if (pos == -1) break;
396
397             int skip = 1;
398             int l = obj.length();
399             while(pos + skip < l && Character.isWhitespace(obj.charAt(pos + skip))) {
400                 if (obj.charAt(pos + skip ) == '\n') {
401                     break;
402                 }
403                 skip++;
404             }
405             if (pos + skip >= l) break;
406             // we need at least 2 lines for a paragraph.
407
// however, if we instead have a list now, and we are not placeListsInsideP,
408
// we should still terminate the paragraph, as the ul then falls outside
409
// the paragraph.
410
if (obj.charAt(pos + skip) != '\n') {
411                 if (!containsListTag(obj,pos + skip)) {
412                     continue;
413                 }
414                 obj.delete(pos, skip);
415                 if (placeListsInsideP) {
416                     int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)+ "l>", pos + 1);
417                     if (posEnd != -1) {
418                         pos = posEnd +5;
419                         if (obj.length() > pos && obj.charAt(pos) == '\n' &&
420                             (obj.length() == pos + 1 || obj.charAt(pos+1) != '\n')) {
421                             obj.delete(pos, 1);
422                         }
423                     }
424                     continue;
425                 }
426             } else {
427                 // delete the 2 new lines of the p.
428
obj.delete(pos, skip + 1);
429             }
430
431             if (leaveExtraNewLines) {
432                 while (obj.length() > pos && Character.isWhitespace(obj.charAt(pos))) {
433                     pos++;
434                 }
435             } else {
436                 while (obj.length() > pos && Character.isWhitespace(obj.charAt(pos))) {
437                     obj.delete(pos, 1); // delete the extra new lines too
438
}
439             }
440             if (inParagraph) { // close the previous paragraph.
441
obj.insert(pos, "</p>");
442                 inParagraph = false;
443                 pos += 4;
444             }
445             // initialize skip for leading whitespace
446
skip = 0;
447             // if the next code happens to be a list tag (ul/ol), we can do two things:
448
// - place the list outside the paragraph (if we are not placeListsInsideP).
449
// In that case, we should not start a new
450
// paragraph until after the list. Moreover, if we are then at the end of the
451
// text we should not include a paragraph at all unless it is enforced.
452
// - include de ul in the paragraph. In that case, we simply continue as normal
453
if (!placeListsInsideP && obj.length() > pos && containsListTag(obj,pos)) {
454                 int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)+ "l>", pos + 1);
455                 // only continue this if this is a balanced list
456
if (posEnd != -1) {
457                     pos = posEnd + 5;
458                     // skip all whitespace after a list.
459
int newlines = 0;
460                     while (obj.length() > (pos + skip) && Character.isWhitespace(obj.charAt(pos + skip))) {
461                         if (obj.charAt(pos + skip ) == '\n') {
462                             newlines++;
463                         }
464                         if (newlines > 1 && leaveExtraNewLines) {
465                             skip++; // count whitespace after the second newline,
466
// to include in the next paragraph
467
} else {
468                             obj.delete(pos, 1); // delete whitespace
469
}
470                     }
471                     // if no text follows, and we don't need an extra paragraphs, skip
472
// note that we always add a <p> if we have the 'ommitsurrounding' option
473
// - because the option expects this.
474
if (surroundingP && pos == obj.length()) {
475                         break;
476                     }
477                 }
478             }
479             // next paragraph.
480
obj.insert(pos, "\r<p>");
481             pos += skip + 4;
482             inParagraph = true;
483         }
484         if (inParagraph) { // in current impl. this is always true
485

486             // read whole text, but stil in paragraph
487
// if text ends with newline, take it away, because it then means </p> rather then <br />
488
if (obj.length() > 0) {
489                 if (obj.charAt(obj.length() - 1) == '\n') {
490                     obj.delete(obj.length() - 1, 1);
491                 }
492             }
493             if (surroundingP) {
494                 obj.insert(obj.length(), "</p>");
495             }
496         }
497     }
498
499     /**
500      * Wikipedia syntax for tables. (simplified)
501      * <pre>
502      * {|
503      * | a || b || c
504      * |-
505      * | d || e || f
506      * |}
507      * </pre>
508      * or e.g.
509      * <pre>
510      * {|-
511      * |+ caption
512      * ! A !! B !! C
513      * |-
514      * | d
515      * | e
516      * | f
517      * |}
518      * </pre>
519      *@since MMBase 1.8
520      */

521     static void handleTables(StringObject obj) {
522         int tables = 0;
523         int pos = 0;
524         while (pos != -1) {
525             // always at beginning of line when here.
526
int l = obj.length();
527             if (pos + 2 < l && ( obj.charAt(pos) == '{' && obj.charAt(pos + 1) == '|')) {
528                 int skip = 2;
529                 // allow for starting with {|- as well
530
if (pos + skip < l && obj.charAt(pos + skip) == '-') skip++;
531                 // allow some trailing whitespace
532
while(pos + skip < l && Character.isWhitespace(obj.charAt(pos + skip))) {
533                     if (obj.charAt(pos + skip ) == '\n') {
534                         break;
535                     }
536                     skip++;
537                 }
538                 if (pos + skip >= l) break;
539                 if (obj.charAt(pos + skip) != '\n') {
540                     pos = obj.indexOf("\n", pos + skip);
541                     continue;
542                 }
543                 skip ++;
544                 log.debug("ok, this is a table!");
545                 // don't use l onwards, length of obj will change
546

547                 if (pos > 0 && obj.charAt(pos - 1) == '\n') {
548                     obj.delete(pos - 1, 1);
549                     pos --;
550                 }
551                 if (pos > 0 && obj.charAt(pos - 1) == '\n') {
552                     obj.delete(pos - 1, 1);
553                     pos --;
554                 }
555                 tables ++;
556                 obj.delete(pos, skip);
557                 obj.insert(pos, "</p><table>");
558                 pos += 11;
559                 if (obj.charAt(pos) == '|' && obj.charAt(pos + 1) == '+') {
560                     obj.delete(pos, 2);
561                     obj.insert(pos, "<caption>");
562                     pos += 9;
563                     pos = obj.indexOf("\n", pos);
564                     obj.delete(pos, 1);
565                     obj.insert(pos, "</caption>");
566                     pos += 10;
567                 }
568                 obj.insert(pos, "<tr>");
569                 pos += 4;
570             }
571             if (pos >= obj.length()) break;
572             // always in tr here.
573
if (tables > 0) {
574                 if (obj.charAt(pos) == '|') {
575                     obj.delete(pos, 1);
576
577                     if (pos + 2 < obj.length() && (obj.charAt(pos) == '-' && obj.charAt(pos + 1) == '\n')) {
578                         obj.delete(pos, 2);
579                         obj.insert(pos, "</tr><tr>");
580                         pos += 9;
581                     } else if (pos + 1 < obj.length() && (obj.charAt(pos) == '}' && (pos + 2 == obj.length() || obj.charAt(pos + 1) == '\n'))) {
582                         obj.delete(pos, 2);
583                         obj.insert(pos, "</tr></table>");
584                         tables--;
585                         pos += 13;
586                         if (tables == 0) {
587                             obj.insert(pos, "<p>");
588                             pos +=3;
589                         }
590                         while (pos < obj.length() && obj.charAt(pos) == '\n') obj.delete(pos, 1);
591                     } else if (pos + 3 < obj.length() && (obj.charAt(pos) == '\n' && obj.charAt(pos + 1) == '{' && obj.charAt(pos + 2) == '|')) {
592                         obj.delete(pos, 3);
593                         obj.insert(pos, "<td><table><tr>");
594                         pos += 15;
595                         tables++;
596                     } else {
597                         obj.insert(pos, "<td>");
598                         pos += 4;
599                         int nl = obj.indexOf("\n", pos);
600                         int pipe = obj.indexOf("||", pos);
601                         int end = pipe == -1 || nl < pipe ? nl : pipe;
602                         if (end == -1) end += obj.length();
603                         pos = end;
604                         obj.delete(pos, 1);
605                         obj.insert(pos, "</td>");
606                         pos += 5;
607                     }
608                     continue;
609                 } else if (obj.charAt(pos) == '!') {
610                     obj.delete(pos, 1);
611                     obj.insert(pos, "<th>");
612                     pos += 4;
613                     int nl = obj.indexOf("\n", pos);
614                     int pipe = obj.indexOf("!!", pos);
615                     int end = pipe == -1 || nl < pipe ? nl : pipe;
616                     if (end == -1) end += obj.length();
617                     pos = end;
618                     obj.delete(pos, 1);
619                     obj.insert(pos, "</th>");
620                     pos += 5;
621                     continue;
622                 } else {
623                     pos = obj.indexOf("\n", pos) + 1;
624                     if (pos >= obj.length()) break;
625                     // oddd. what to do know?
626
}
627             } else { // not in table, ignore find next new line
628
pos = obj.indexOf("\n", pos) + 1;
629                 if (pos == 0) break;
630                 if (pos >= obj.length()) break;
631             }
632         }
633         while (tables > 0) {
634             obj.insert(pos, "</tr></table>");
635             pos+= 13;
636             tables--;
637             if (tables == 0) {
638                 obj.insert(pos, "<p>");
639                 pos += 3;
640                 while (pos < obj.length() && obj.charAt(pos) == '\n') obj.delete(pos, 1);
641             }
642         }
643
644     }
645     /**
646      * Removes all new lines and space which are too much.
647      */

648     static void cleanupText(StringObject obj) {
649         // remaining new lines have no meaning.
650
obj.replace(">\n", ">"); // don't replace by space if it is just after a tag, it could have a meaning then.
651
obj.replace("\n", " "); // replace by space, because people could use it as word boundary.
652
// remaining double spaces have no meaning as well:
653
int pos = obj.indexOf(" ", 0);
654         while (pos != -1) {
655             pos++;
656             while (obj.length() > pos && obj.charAt(pos) == ' ') {
657                 obj.delete(pos, 1);
658             }
659             pos = obj.indexOf(" ", pos);
660         }
661         // we used \r for non significant newlines:
662
obj.replace("\r", "");
663
664     }
665
666     /**
667      * Only escape, clean up.
668      * @since MMBase-1.7
669      */

670     protected static void handleFormat(StringObject obj, boolean format) {
671         if (format) {
672             obj.replace("\r", "\n");
673         } else {
674             cleanupText(obj);
675         }
676
677     }
678     protected static String JavaDoc prepareDataString(String JavaDoc data) {
679         return Xml.XMLEscape(data).replaceAll("\r", ""); // drop returns (\r), we work with newlines, \r will be used as a help.
680
}
681     protected static StringObject prepareData(String JavaDoc data) {
682         return new StringObject(prepareDataString(data));
683     }
684
685     protected static void handleRich(StringObject obj, boolean sections, boolean leaveExtraNewLines, boolean surroundingP) {
686         handleRich(obj, sections, leaveExtraNewLines, surroundingP, false);
687     }
688
689     protected static void handleRich(StringObject obj, boolean sections, boolean leaveExtraNewLines, boolean surroundingP, boolean placeListsInsideP) {
690         // the order _is_ important!
691
handleList(obj);
692         handleTables(obj);
693         handleParagraphs(obj, leaveExtraNewLines, surroundingP, placeListsInsideP);
694         if (sections) {
695             handleHeaders(obj);
696         }
697         handleEmph(obj, '_', "em");
698         handleEmph(obj, '*', "strong");
699     }
700
701     static void handleNewlines(StringObject obj) {
702         obj.replace("</ul>\n", "</ul>"); // otherwise we will wind up with the silly "</ul><br />" the \n was necessary for </ul></p>
703
obj.replace("\n", "<br />\r"); // handle new remaining newlines.
704
}
705
706     private static Pattern wikiWrappingAnchor = Pattern.compile("\\[(\\w+):(.*?)\\]");
707     private static Pattern wikiP = Pattern.compile("<p>\\[(\\w+)\\]");
708     private static Pattern wikiSection = Pattern.compile("<section><h>\\[(\\w+)\\]");
709     private static Pattern wikiAnchor = Pattern.compile("\\[(\\w+)\\]");
710
711     public static String JavaDoc wikiToXML(String JavaDoc data) {
712         Matcher wrappingAnchors = wikiWrappingAnchor.matcher(prepareDataString(data));
713         data = wrappingAnchors.replaceAll("<a id=\"$1\">$2</a>");
714         StringObject obj = new StringObject(data);
715         handleRich(obj, true, false, true);
716         handleFormat(obj, false);
717         String JavaDoc string = obj.toString();
718         Matcher ps = wikiP.matcher(string);
719         string = ps.replaceAll("<p id=\"$1\">");
720         Matcher sections = wikiSection.matcher(string);
721         string = sections.replaceAll("<section id=\"$1\"><h>");
722         Matcher anchors = wikiAnchor.matcher(string);
723         string = anchors.replaceAll("<a id=\"$1\" />");
724         return string;
725
726     }
727
728     /**
729      * Defines a kind of 'rich' text format. This is a way to easily
730      * type structured text in XML. The XML tags which can be
731      * produced by this are all HTML as well.
732      *
733      * This is a generalisation of the MMBase html() functions which
734      * does similar duties, but hopefully this one is better, and more
735      * powerfull too.
736      *
737      * The following things are recognized:
738      * <ul>
739      * <li> Firstly, XMLEscape is called.</li>
740      * <li> A line starting with an asterix (*) will start an unnumberd
741      * list. The first new line not starting with a space or an other
742      * asterix will end the list </li>
743      * <li> Underscores are translated to the emphasize HTML-tag</li>
744      * <li> You can create a header tag by by starting a line with a dollar signs</li>
745      * <li> A paragraph can be begun (and ended) with an empty line.</li>
746      * </ul>
747      *
748      * Test with commandline: java org.mmbase.util.Encode RICH_TEXT (reads from STDIN)
749      *
750      * @param data text to convert
751      * @param format if the resulting XML must be nicely formatted (default: false)
752      * @return the converted text
753      */

754
755     public static String JavaDoc richToXML(String JavaDoc data, boolean format) {
756         StringObject obj = prepareData(data);
757         handleRich(obj, true, true, true);
758         handleNewlines(obj);
759         handleFormat(obj, format);
760         return obj.toString();
761     }
762     public static String JavaDoc richToXML(String JavaDoc data) {
763         return richToXML(data, false);
764     }
765     /**
766      * As richToXML but a little less rich. Which means that only one new line is non significant.
767      * @see #richToXML
768      */

769
770     public static String JavaDoc poorToXML(String JavaDoc data, boolean format) {
771         StringObject obj = prepareData(data);
772         handleRich(obj, true, false,true);
773         handleFormat(obj, format);
774         return obj.toString();
775     }
776
777     public static String JavaDoc poorToXML(String JavaDoc data) {
778         return poorToXML(data, false);
779     }
780
781     /**
782      * So poor, that it actually generates pieces of XHTML 1.1 blocks (so, no use of sections).
783      *
784      * @see #richToXML
785      * @since MMBase-1.7
786      */

787     public static String JavaDoc richToHTMLBlock(String JavaDoc data, boolean multipibleBrs, boolean surroundingP, boolean placeListsInsideP) {
788         StringObject obj = prepareData(data);
789         handleRich(obj, false, multipibleBrs, surroundingP, placeListsInsideP); // no <section> tags, leave newlines if multipble br's requested
790
handleNewlines(obj);
791         handleFormat(obj, false);
792         return obj.toString();
793     }
794
795
796     public static String JavaDoc richToHTMLBlock(String JavaDoc data) {
797         return richToHTMLBlock(data, false, true, true);
798     }
799
800     public static String JavaDoc richToHTMLBlock(String JavaDoc data, boolean multipibleBrs, boolean surroundingP) {
801         return richToHTMLBlock(data, multipibleBrs, surroundingP, true);
802     }
803
804     /**
805      * So poor, that it actually generates pieces of XHTML 1.1 inlines (so, no use of section, br, p).
806      *
807      * @since MMBase-1.7
808      */

809     public static String JavaDoc poorToHTMLInline(String JavaDoc data) {
810         StringObject obj = prepareData(data);
811         // don't add newlines.
812
handleFormat(obj, false);
813         handleEmph(obj, '_', "em");
814         handleEmph(obj, '*', "strong");
815         return obj.toString();
816     }
817
818
819     /**
820      * Base function for XSL conversions.
821      */

822
823     protected static String JavaDoc XSLTransform(String JavaDoc xslFile, String JavaDoc data) {
824         try {
825             java.net.URL JavaDoc u = ResourceLoader.getConfigurationRoot().getResource("xslt/" + xslFile);
826             java.io.StringWriter JavaDoc res = new java.io.StringWriter JavaDoc();
827             XSLTransformer.transform(new StreamSource JavaDoc(new StringReader(data)), u, new StreamResult JavaDoc(res), null);
828             return res.toString();
829         } catch (javax.xml.transform.TransformerException JavaDoc te) {
830             return te.getMessage();
831         }
832     }
833
834     protected static void validate(String JavaDoc incoming) throws FormatException {
835         try {
836             if (log.isDebugEnabled()) {
837                 log.debug("Validating " + incoming);
838             }
839             javax.xml.parsers.DocumentBuilderFactory JavaDoc dfactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
840
841             // turn validating on..
842
dfactory.setValidating(true);
843             dfactory.setNamespaceAware(true);
844             javax.xml.parsers.DocumentBuilder JavaDoc documentBuilder = dfactory.newDocumentBuilder();
845
846             // in order to find the dtd.....
847
org.mmbase.util.XMLEntityResolver resolver = new org.mmbase.util.XMLEntityResolver();
848             documentBuilder.setEntityResolver(resolver);
849
850             // in order to log our xml-errors
851
StringBuffer JavaDoc errorBuff = new StringBuffer JavaDoc();
852             ErrorHandler errorHandler = new ErrorHandler(errorBuff);
853             documentBuilder.setErrorHandler(errorHandler);
854             // documentBuilder.init();
855
java.io.InputStream JavaDoc input = new java.io.ByteArrayInputStream JavaDoc(incoming.getBytes(CODING));
856             documentBuilder.parse(input);
857
858             if (!resolver.hasDTD()) {
859                 throw new FormatException("no doc-type specified for the xml");
860             }
861             if (errorHandler.errorOrWarning) {
862                 throw new FormatException("error in xml: \n" + errorBuff.toString());
863             }
864         } catch (javax.xml.parsers.ParserConfigurationException JavaDoc pce) {
865             throw new FormatException("[sax parser] not well formed xml: " + pce.toString());
866         } catch (org.xml.sax.SAXException JavaDoc se) {
867             log.debug("", se);
868             //throw new FormatException("[sax] not well formed xml: "+se.toString() + "("+se.getMessage()+")");
869
} catch (java.io.IOException JavaDoc ioe) {
870             throw new FormatException("[io] not well formed xml: " + ioe.toString());
871         }
872     }
873
874     protected static class FormatException extends java.lang.Exception JavaDoc {
875         FormatException(String JavaDoc msg) {
876             super(msg);
877         }
878     }
879
880     // Catch any errors or warnings,....
881
static class ErrorHandler implements org.xml.sax.ErrorHandler JavaDoc {
882         boolean errorOrWarning;
883         StringBuffer JavaDoc errorBuff;
884
885         ErrorHandler(StringBuffer JavaDoc errorBuff) {
886             super();
887             this.errorBuff = errorBuff;
888             errorOrWarning = false;
889         }
890
891         // all methods from org.xml.sax.ErrorHandler
892
// from org.xml.sax.ErrorHandler
893
public void fatalError(org.xml.sax.SAXParseException JavaDoc exc) {
894             errorBuff.append("FATAL[" + getLocationString(exc) + "]:" + exc.getMessage() + "\n");
895             errorOrWarning = true;
896         }
897
898         // from org.xml.sax.ErrorHandler
899
public void error(org.xml.sax.SAXParseException JavaDoc exc) {
900             errorBuff.append("Error[" + getLocationString(exc) + "]: " + exc.getMessage() + "\n");
901             errorOrWarning = true;
902         }
903
904         // from org.xml.sax.ErrorHandler
905
public void warning(org.xml.sax.SAXParseException JavaDoc exc) {
906             errorBuff.append("Warning[" + getLocationString(exc) + "]:" + exc.getMessage() + "\n");
907             errorOrWarning = true;
908         }
909
910         // helper methods
911
/**
912          * Returns a string of the location.
913          */

914         private String JavaDoc getLocationString(org.xml.sax.SAXParseException JavaDoc ex) {
915             StringBuffer JavaDoc str = new StringBuffer JavaDoc();
916             String JavaDoc systemId = ex.getSystemId();
917             if (systemId != null) {
918                 int index = systemId.lastIndexOf('/');
919                 if (index != -1) {
920                     systemId = systemId.substring(index + 1);
921                 }
922                 str.append(systemId);
923             }
924             str.append(" line:");
925             str.append(ex.getLineNumber());
926             str.append(" column:");
927             str.append(ex.getColumnNumber());
928             return str.toString();
929         }
930     }
931
932     public XmlField() {
933         super();
934     }
/**
 * Creates an instance and hands the given value to the superclass.
 *
 * @param to presumably one of the coding constants of this class, such as
 *           {@link #POORBODY} or {@link #ASCII} (confirm against the superclass)
 */
public XmlField(int to) {
    super(to);
}
938
939     public Map JavaDoc transformers() {
940         Map JavaDoc h = new HashMap JavaDoc();
941         h.put("MMXF_ASCII", new Config(XmlField.class, ASCII, "Converts xml to ASCII (cannoted be reversed)"));
942         h.put("MMXF_BODY_RICH", new Config(XmlField.class, RICHBODY, "XHTML 2 compliant XML."));
943         h.put("MMXF_BODY_POOR", new Config(XmlField.class, POORBODY, "XHTML 2 compliant XML, but withough <br/> tags"));
944         h.put("MMXF_HTML_INLINE", new Config(XmlField.class, HTML_INLINE, "Decodes only escaping and with <em>"));
945         h.put("MMXF_HTML_BLOCK", new Config(XmlField.class, HTML_BLOCK, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>"));
946         h.put("MMXF_HTML_BLOCK_BR", new Config(XmlField.class, HTML_BLOCK_BR, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>"));
947         h.put("MMXF_HTML_BLOCK_NOSURROUNDINGP", new Config(XmlField.class, HTML_BLOCK_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>"));
948         h.put("MMXF_HTML_BLOCK_BR_NOSURROUNDINGP", new Config(XmlField.class, HTML_BLOCK_BR_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>"));
949         h.put("MMXF_HTML_BLOCK_LIST", new Config(XmlField.class, HTML_BLOCK_LIST, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>, with <ul> inside the <p>"));
950         h.put("MMXF_HTML_BLOCK_LIST_NOSURROUNDINGP", new Config(XmlField.class, HTML_BLOCK_LIST_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>, with <ul> inside the <p>"));
951         h.put("MMXF_HTML_BLOCK_LIST_BR", new Config(XmlField.class, HTML_BLOCK_LIST_BR, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>, with <ul> inside the <p>"));
952         h.put("MMXF_HTML_BLOCK_LIST_BR_NOSURROUNDINGP", new Config(XmlField.class, HTML_BLOCK_LIST_BR_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>, with <ul> inside the <p>"));
953         h.put("MMXF_XHTML", new Config(XmlField.class, XHTML, "Converts to piece of XHTML"));
954         return h;
955     }
956
957     public String JavaDoc transform(String JavaDoc data) {
958         switch (to) {
959         case RICHBODY :
960         case POORBODY :
961             throw new UnsupportedOperationException JavaDoc();
962             // XXXX
963
// needing richtext xslt here.
964
//return XSLTransform("mmxf2rich.xslt", XML_TAGSTART + data + XML_TAGEND);
965
case ASCII :
966             return XSLTransform("text.xslt", data);
967         case HTML_BLOCK:
968         case HTML_BLOCK_BR:
969         case HTML_INLINE:
970             throw new UnsupportedOperationException JavaDoc("Cannot transform");
971         default :
972             throw new UnknownCodingException(getClass(), to);
973         }
974     }
975
976     public String JavaDoc transformBack(String JavaDoc r) {
977         String JavaDoc result = null;
978         switch (to) {
979         case RICHBODY :
980             result = richToXML(r);
981             // rich will not be validated... Cannot be used yet!!
982
break;
983         case POORBODY :
984             result = poorToXML(r);
985             break;
986         case HTML_BLOCK:
987             result = richToHTMLBlock(r, false, true, true);
988             break;
989         case HTML_BLOCK_BR:
990             result = richToHTMLBlock(r, true, true, true);
991             break;
992         case HTML_BLOCK_NOSURROUNDINGP:
993             result = richToHTMLBlock(r, false, false, true);
994             break;
995         case HTML_BLOCK_BR_NOSURROUNDINGP:
996             result = richToHTMLBlock(r, true, false, true);
997             break;
998
999         case HTML_BLOCK_LIST:
1000            result = richToHTMLBlock(r, false, true, false);
1001            break;
1002        case HTML_BLOCK_LIST_BR:
1003            result = richToHTMLBlock(r, true, true, false);
1004            break;
1005        case HTML_BLOCK_LIST_NOSURROUNDINGP:
1006            result = richToHTMLBlock(r, false, false, false);
1007            break;
1008        case HTML_BLOCK_LIST_BR_NOSURROUNDINGP:
1009            result = richToHTMLBlock(r, true, false, false);
1010            break;
1011
1012        case HTML_INLINE:
1013            result = poorToHTMLInline(r);
1014            break;
1015        case ASCII :
1016            throw new UnsupportedOperationException JavaDoc("Cannot transform");
1017        default :
1018            throw new UnknownCodingException(getClass(), to);
1019        }
1020        return result;
1021    }
1022
1023    public String JavaDoc getEncoding() {
1024        switch (to) {
1025        case RICHBODY :
1026            return "MMXF_BODY_RICH";
1027        case POORBODY :
1028            return "MMXF_BODY_POOR";
1029        case HTML_BLOCK :
1030            return "MMXF_HTML_BLOCK";
1031        case HTML_BLOCK_BR :
1032            return "MMXF_HTML_BLOCK_BR";
1033        case HTML_BLOCK_NOSURROUNDINGP :
1034            return "MMXF_HTML_BLOCK_NOSURROUNDINGP";
1035        case HTML_BLOCK_BR_NOSURROUNDINGP :
1036            return "MMXF_HTML_BLOCK_BR_NOSURROUNDINGP";
1037        case HTML_BLOCK_LIST :
1038            return "MMXF_HTML_BLOCK_LIST";
1039        case HTML_BLOCK_LIST_BR :
1040            return "MMXF_HTML_BLOCK_LIST_BR";
1041        case HTML_BLOCK_LIST_NOSURROUNDINGP :
1042            return "MMXF_HTML_BLOCK_LIST_NOSURROUNDINGP";
1043        case HTML_BLOCK_LIST_BR_NOSURROUNDINGP :
1044            return "MMXF_HTML_BLOCK_LIST_BR_NOSURROUNDINGP";
1045        case HTML_INLINE :
1046            return "MMXF_HTML_INLINE";
1047        case ASCII :
1048            return "MMXF_ASCII";
1049        default :
1050            throw new UnknownCodingException(getClass(), to);
1051        }
1052    }
1053}
1054
Popular Tags