XmlField


1   package org.mmbase.util.transformers;
2   
3   import java.io.*;
4   import java.util.HashMap  ;
5   import java.util.Map  ;
6   
7   import javax.xml.transform.stream.StreamSource  ;
8   import javax.xml.transform.stream.StreamResult  ;
9   
10  import java.util.regex.*;
11  
12  import org.mmbase.util.StringObject;
13  import org.mmbase.util.ResourceLoader;
14  import org.mmbase.util.XSLTransformer;
15  
16  import org.mmbase.util.logging.Logger;
17  import org.mmbase.util.logging.Logging;
18  
19  /**
20   * XMLFields in MMBase. This class can encode such a field to several other formats.
21   *
22   * @author Michiel Meeuwissen
23   * @version $Id: XmlField.java,v 1.46 2006/04/10 13:34:19 pierre Exp $
24   * @todo   THIS CLASS NEEDS A CONCEPT! It gets a bit messy.
25   */
26  
27  public class XmlField extends ConfigurableStringTransformer implements CharTransformer {
28  
    /** Logger used by the static helpers of this class. */
    private static final Logger log = Logging.getLoggerInstance(XmlField.class);

    // Integer identifiers for the conversions this transformer knows about.
    // NOTE(review): the code that dispatches on these values is not visible in
    // this part of the file; the grouping below is the original author's.

    // can be decoded:
    public final static int POORBODY = 5;
    public final static int RICHBODY = 6;

    // cannot even be encoded yet:
    public final static int HTML_INLINE = 7;
    public final static int HTML_BLOCK = 8;
    public final static int HTML_BLOCK_BR  = 9;
    public final static int HTML_BLOCK_NOSURROUNDINGP = 10;
    public final static int HTML_BLOCK_BR_NOSURROUNDINGP = 11;
    public final static int HTML_BLOCK_LIST = 12;
    public final static int HTML_BLOCK_LIST_BR = 13;
    public final static int HTML_BLOCK_LIST_NOSURROUNDINGP = 14;
    public final static int HTML_BLOCK_LIST_BR_NOSURROUNDINGP = 15;

    // cannot be decoded:
    public final static int ASCII = 51;
    public final static int XHTML = 52;

    private final static String   CODING = "UTF-8"; // This class only supports UTF-8 for now.
51  
52  
53  
54      private static boolean isListChar(char c) {
55          return c == '-' || c == '*';
56      }
57      private static String   listTag(char c) {
58          return c == '-' ? "ul" : "ol";
59      }
60  
61      /**
62       * Takes a string object, finds list structures and changes those to XML
63       */
64      static void handleList(StringObject obj) {
65          // handle lists
66          // make <ul> possible (not yet nested), with -'s on the first char of line.
67          int inList = 0; //
68          int pos = 0;
69          if (obj.length() < 3) {
70              return;
71          }
72          char listChar = '-';
73          if (isListChar(obj.charAt(0)) && !isListChar(obj.charAt(1))) { // hoo, we even _start_ with a list;
74              listChar = obj.charAt(0);
75              obj.insert(0, "\n"); // in the loop \n- is deleted, so it must be there.
76          } else {
77              while (true) {
78                  int pos1 = obj.indexOf("\n-", pos); // search the first
79                  int pos2 = obj.indexOf("\n*", pos); // search the first
80  
81                  pos = (pos1 > 0 && pos1 < pos2) || pos2 < 0 ? pos1 : pos2;
82                  if (pos == -1 || obj.length() <= pos + 2) break;
83                  if (! isListChar(obj.charAt(pos + 2))) {
84                      listChar = obj.charAt(pos + 1);
85                      break;
86                  }
87                  pos += 2;
88              }
89          }
90  
91          listwhile : while (pos != -1) {
92              if (inList == 0) { // not yet in list
93                  inList++; // now we are
94                  obj.delete(pos, 2); // delete \n-
95                  // remove spaces..
96                  while (pos < obj.length() && obj.charAt(pos) == ' ') {
97                      obj.delete(pos, 1);
98                  }
99                  if (pos > 0) {
100                     obj.insert(pos, "\n");
101                     pos += 1;
102                 }
103                 obj.insert(pos, "<" + listTag(listChar) + ">\r<li>"); // insert 9 chars.
104                 pos += 9;
105 
106             } else { // already in list
107                 if (obj.charAt(pos + 1) != listChar) { // end of list
108                     obj.delete(pos, 1); // delete \n
109                     obj.insert(pos, "</li>\r</" + listTag(listChar) + ">\n");
110                     pos += 12;
111                     inList--;
112                 } else { // not yet end
113                     obj.delete(pos, 2); // delete \n-
114                     // remove spaces..
115                     while (pos < obj.length() && obj.charAt(pos) == ' ')
116                         obj.delete(pos, 1);
117                     obj.insert(pos, "</li>\r<li>");
118                     pos += 10;
119                 }
120             }
121             if (inList > 0) { // search for new line
122                 pos = obj.indexOf("\n", pos);
123                 if (pos == -1)
124                     break; // no new line found? End of list, of text.
125                 if (pos + 1 == obj.length()) {
126                     obj.delete(pos, 1);
127                     break; // if end of text, simply remove the newline.
128                 }
129                 while (obj.charAt(pos + 1) == ' ') {
130                     // if next line starts with space, this new line does not count. This makes it possible to have some formatting in a <li>
131                     pos = obj.indexOf("\n", pos + 1);
132                     if (pos + 1 == obj.length()) {
133                         obj.delete(pos, 1);
134                         break listwhile; // nothing to do...
135                     }
136                 }
137             } else { // search for next item
138                 while (true) {
139                     int pos1 = obj.indexOf("\n-", pos);
140                     int pos2 = obj.indexOf("\n*", pos);
141 
142                     pos = (pos1 > 0 && pos1 < pos2) || pos2 < 0 ? pos1 : pos2;
143                     if (pos == -1 || obj.length() <= pos + 2) break;
144                     if (! isListChar(obj.charAt(pos + 2))) {
145                         listChar = obj.charAt(pos + 1);
146                         break; // should not start with two -'s, because this is some seperation line
147                     }
148                     pos += 2;
149                 }
150             }
151         }
152         // make sure that the list is closed:
153         while (inList > 0) { // lists in lists not already supported, but if we will...
154             obj.insert(obj.length(), "</li></" + listTag(listChar) + ">\n");
155             inList--; // always finish with a new line, it might be needed for the finding of paragraphs.
156         }
157 
158     }
159     /**
160      * If you want to add a _ in your text, that should be possible too...
161      * Should be done last, because no tags can appear in <em>
162 
163      * @param ch This is '_' or e.g. '*'
164      * @param tag The tag to produce, e.g. "em" or "strong"
165      */
166     // test cases:
167     // I cite _m_pos_! -> <mmxf><p>I cite <em>m_pos</em>!</p></mmxf>
168 
169     static void handleEmph(StringObject obj, char ch, String   tag) {
170 
171         obj.replace("" + ch + ch, "&#95;"); // makes it possible to escape underscores (or what you choose)
172 
173         // Emphasizing. This is perhaps also asking for trouble, because
174         // people will try to use it like <font> or other evil
175         // things. But basicly emphasizion is content, isn't it?
176 
177         String   sch = "" + ch;
178 
179         int posEmphOpen = obj.indexOf(sch, 0);
180         int posTagOpen = obj.indexOf("<", 0); // must be closed before next tag opens.
181 
182 
183         OUTER:
184         while (posEmphOpen != -1) {
185 
186             if (posTagOpen > 0 &&
187                 posTagOpen < posEmphOpen) { // ensure that we are not inside existing tags
188                 int posTagClose = obj.indexOf(">", posTagOpen);
189                 if (posTagClose == -1) break;
190                 posEmphOpen = obj.indexOf(sch, posTagClose);
191                 posTagOpen  = obj.indexOf("<", posTagClose);
192                 continue;
193             }
194 
195             if (posEmphOpen + 1 >= obj.length()) break; // no use, nothing can follow
196 
197             if ((posEmphOpen > 0 && Character.isLetterOrDigit(obj.charAt(posEmphOpen - 1))) ||
198                 (! Character.isLetterOrDigit(obj.charAt(posEmphOpen + 1)))) {
199                 // _ is inside a word, ignore that.
200                 // or not starting a word
201                 posEmphOpen = obj.indexOf(sch, posEmphOpen + 1);
202                 continue;
203             }
204 
205             // now find closing _.
206             int posEmphClose = obj.indexOf(sch, posEmphOpen + 1);
207             if (posEmphClose == -1) break;
208             while((posEmphClose + 1) < obj.length() &&
209                   (Character.isLetterOrDigit(obj.charAt(posEmphClose + 1)))
210                   ) {
211                 posEmphClose = obj.indexOf(sch, posEmphClose + 1);
212                 if (posEmphClose == -1) break OUTER;
213             }
214 
215             if (posTagOpen > 0
216                 && posEmphClose > posTagOpen) {
217                 posEmphOpen = obj.indexOf(sch, posTagOpen); // a tag opened before emphasis close, ignore then too, and re-search
218                 continue;
219             }
220 
221             // realy do replacing now
222             obj.delete(posEmphClose, 1);
223             obj.insert(posEmphClose,"</" + tag + ">");
224             obj.delete(posEmphOpen, 1);
225             obj.insert(posEmphOpen, "<" + tag + ">");
226             posEmphClose += 7;
227 
228             posEmphOpen = obj.indexOf(sch, posEmphClose);
229             posTagOpen  = obj.indexOf("<", posEmphClose);
230 
231         }
232 
233         obj.replace("&#95;", sch);
234     }
235 
    /**
     * Converts paragraphs that are really headers into nested &lt;section&gt;
     * elements, in place. Meant to run after handleParagraphs, because it looks
     * for the literal text "&lt;p&gt;$".
     *
     * A header, in our 'rich' text format, is a paragraph starting with one or
     * more '$'. The number of '$' is the requested section level. Sections that
     * are still open at the same or a deeper level are closed first, and missing
     * intermediate levels are opened, so the resulting &lt;section&gt; tags nest.
     * The title runs until the first "&lt;/p&gt;" or newline and is wrapped in
     * &lt;h&gt;; underscores inside the title are removed. All sections still open
     * at the end of the text are closed there.
     *
     * @param obj the text to rewrite
     */
    static void handleHeaders(StringObject obj) {
        // handle headers
        int requested_level;
        char ch;
        int level = 0; // start without being in section.
        int pos = obj.indexOf("<p>$", 0);
        OUTER:
        while (pos != -1) {
            obj.delete(pos, 4); // remove <p>$

            requested_level = 1;
            // find requested level: every further '$' is one level deeper,
            // and one space directly after the '$'s is dropped.
            while (true) {
                ch = obj.charAt(pos);
                if (ch == '$') {
                    requested_level++;
                    obj.delete(pos, 1);
                } else {
                    if (ch == ' ') {
                        obj.delete(pos, 1);
                    }
                    break;
                }
            }
            StringBuffer   add = new StringBuffer  ();
            for (; requested_level <= level; level--) {
                // close open sections of the same or a deeper level
                add.append("</section>");
            }
            level++;
            for (; requested_level > level; level++) {
                // open the intermediate levels that were skipped
                add.append("<section>");
            }
            add.append("<section><h>");

            obj.insert(pos, add.toString());
            pos += add.length();

            // search the end of the header title

            while (true) { // oh yes, and don't allow _ in title.
                int pos1 = obj.indexOf("_", pos);
                int posP  = obj.indexOf("</p>", pos);
                int posNl = obj.indexOf("\n", pos);
                int delete; // number of characters to remove at the end of the title
                int  pos2;  // position where the title ends
                if ((posP > 0 && posP < posNl) || posNl == -1) {
                    // the paragraph ends before the line does (or there is no newline)
                    pos2 =  posP;
                    delete = 4;
                } else {
                    pos2 = posNl;
                    delete = 1;
                }
                if (pos1 < pos2 && pos1 > 0) {
                    // an underscore inside the title: remove it and look again
                    obj.delete(pos1, 1);
                } else {
                    pos = pos2;
                    if (pos == -1) {
                        break OUTER; // not found, could not happen.
                    }
                    obj.delete(pos, delete);
                    obj.insert(pos, "</h>");
                    pos += 4;
                    if (delete == 1) {
                        // the title ended at a newline, so the rest of the
                        // original paragraph needs its own opening <p>
                        obj.insert(pos, "<p>");
                        pos += 3;
                    }
                    break;
                }
            }
            pos = obj.indexOf("<p>$", pos); // search the next one.
        }
        // ready, close all sections still open.
        for (; level > 0; level--) {
            obj.insert(obj.length(), "</section>");
        }

    }
323 
324     // check if on that position the string object contains a <ul> or <ol>
325     static private boolean containsListTag(StringObject obj, int pos) {
326         return obj.length() > pos + 4 &&
327                obj.charAt(pos) == '<' &&
328                (obj.charAt(pos+1) == 'u' || obj.charAt(pos+1) == 'o') &&
329                obj.charAt(pos+2) == 'l' &&
330                obj.charAt(pos+3) == '>';
331     }
332 
333     /**
334      * Make <p> </p> tags.
335      * @param leaveExtraNewLines (defaults to false) if false, 2 or more newlines starts a new p. If true, every 2 newlines starts new p, and every extra new line simply stays (inside the p).
336      * @param surroundingP (defaults to true) wether the surrounding &lt;p&gt; should be included too.
337      */
338     static void handleParagraphs(StringObject obj, boolean leaveExtraNewLines, boolean surroundingP) {
339         handleParagraphs(obj, leaveExtraNewLines, surroundingP, false);
340     }
341 
    /**
     * Makes &lt;p&gt; &lt;/p&gt; tags, in place. An empty line (a newline, optional
     * other whitespace, and another newline) separates two paragraphs.
     *
     * Note that if placeListsInsideP is <code>false</code>, the code generated with lists becomes akin to:
     * &lt;p&gt;...&lt;/p&gt;&lt;ul&gt;...&lt;/ul&gt;&lt;p&gt;...&lt;/p&gt;
     *
     * If placeListsInsideP is <code>true</code>, the code becomes:
     * &lt;p&gt;...&lt;ul&gt;...&lt;/ul&gt;...&lt;/p&gt;
     *
     * If there is no content in front of the first list, or after the last list, those paragraphs are empty and may not be
     * added.
     *
     * Generated paragraph starts are preceded by '\r', which serves as a
     * non-significant line break.
     *
     * @param obj the text to rewrite
     * @param leaveExtraNewLines if false, 2 or more newlines start a new p. If true, every 2 newlines start a new p, and every extra newline simply stays (inside the p).
     * @param surroundingP whether the surrounding &lt;p&gt; should be included too.
     * @param placeListsInsideP whether a list should be placed inside a &lt;p&gt; (as allowed by xhtml2).
     */
    static void handleParagraphs(StringObject obj, boolean leaveExtraNewLines, boolean surroundingP, boolean placeListsInsideP) {
        // handle paragraphs:
        boolean inParagraph = true;
        int pos = 0;
        // if the text starts with a list that has to stay outside the paragraphs, skip it
        if (surroundingP) {
            if (!placeListsInsideP && containsListTag(obj,pos)) {
                //note: this does not take into account nested lists
                // charAt(pos + 1) is 'u' or 'o', so this searches for the matching </ul> or </ol>
                int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)+ "l>", pos + 1);
                // only continue this if this is a balanced list
                if (posEnd != -1) {
                    pos = posEnd +5; // just behind the closing list tag
                    if (obj.length() > pos && obj.charAt(pos) == '\n') {
                        obj.delete(pos, 1);
                    }
                    if (pos >= obj.length()) {
                        // the text consists of the list only: no paragraph needed
                        return;
                    }
                }
            }
            obj.insert(pos, "<p>");
            pos += 3;
        } else {
            // if the code starts with a list, and it should be placed outside a paragraph,
            // add a \n to make sure that the list is parsed
            if (!placeListsInsideP && containsListTag(obj,pos)) {
                obj.insert(pos, "\n");
            }
        }
        boolean start = true;
        while (pos < obj.length()) {
            // one or more empty lines.
            if (start) {
                // the very first search may match a newline at pos itself
                start = false;
                pos = obj.indexOf("\n", pos);
            } else {
                pos = obj.indexOf("\n", pos + 1);
            }
            if (pos == -1) break;

            // skip whitespace (other than a newline) that follows the newline
            int skip = 1;
            int l = obj.length();
            while(pos + skip < l && Character.isWhitespace(obj.charAt(pos + skip))) {
                if (obj.charAt(pos + skip ) == '\n') {
                    break;
                }
                skip++;
            }
            if (pos + skip >= l) break;
            // we need at least 2 lines for a paragraph.
            // however, if we instead have a list now, and we are not placeListsInsideP,
            // we should still terminate the paragraph, as the ul then falls outside
            // the paragraph.
            if (obj.charAt(pos + skip) != '\n') {
                if (!containsListTag(obj,pos + skip)) {
                    // a single newline followed by ordinary text: not a paragraph break
                    continue;
                }
                // remove the newline and the whitespace in front of the list
                obj.delete(pos, skip);
                if (placeListsInsideP) {
                    // the list stays inside the current paragraph: jump over it
                    int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)+ "l>", pos + 1);
                    if (posEnd != -1) {
                        pos = posEnd +5;
                        // drop a single newline behind the list, but keep it
                        // when it is the first of two (a paragraph break)
                        if (obj.length() > pos && obj.charAt(pos) == '\n' &&
                            (obj.length() == pos + 1 || obj.charAt(pos+1) != '\n')) {
                            obj.delete(pos, 1);
                        }
                    }
                    continue;
                }
            } else {
                // delete the 2 new lines of the p.
                obj.delete(pos, skip + 1);
            }

            if (leaveExtraNewLines) {
                // keep the extra whitespace, it ends up in front of the </p>
                while (obj.length() > pos && Character.isWhitespace(obj.charAt(pos))) {
                    pos++;
                }
            } else {
                while (obj.length() > pos && Character.isWhitespace(obj.charAt(pos))) {
                    obj.delete(pos, 1); // delete the extra new lines too
                }
            }
            if (inParagraph) { // close the previous paragraph.
                obj.insert(pos, "</p>");
                inParagraph = false;
                pos += 4;
            }
            // initialize skip for leading whitespace
            skip = 0;
            // if the next code happens to be a list tag (ul/ol), we can do two things:
            // - place the list outside the paragraph (if we are not placeListsInsideP).
            //   In that case, we should not start a new
            //   paragraph until after the list. Moreover, if we are then at the end of the
            //   text we should not include a paragraph at all unless it is enforced.
            // - include the list in the paragraph. In that case, we simply continue as normal
            if (!placeListsInsideP && obj.length() > pos && containsListTag(obj,pos)) {
                int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)+ "l>", pos + 1);
                // only continue this if this is a balanced list
                if (posEnd != -1) {
                    pos = posEnd + 5;
                    // skip all whitespace after a list.
                    int newlines = 0;
                    while (obj.length() > (pos + skip) && Character.isWhitespace(obj.charAt(pos + skip))) {
                        if (obj.charAt(pos + skip ) == '\n') {
                            newlines++;
                        }
                        if (newlines > 1 && leaveExtraNewLines) {
                            skip++; // count whitespace after the second newline,
                                    // to include in the next paragraph
                        } else {
                            obj.delete(pos, 1); // delete whitespace
                        }
                    }
                    // if no text follows, and we don't need an extra paragraph, skip.
                    // note that we always add a <p> if the surrounding p is omitted
                    // - because that option expects this.
                    if (surroundingP && pos == obj.length()) {
                        break;
                    }
                }
            }
            // next paragraph.
            obj.insert(pos, "\r<p>");
            pos += skip + 4;
            inParagraph = true;
        }
        if (inParagraph) { // false only when the loop stopped directly behind a trailing list

            // read whole text, but still in paragraph
            // if text ends with newline, take it away, because it then means </p> rather than <br />
            if (obj.length() > 0) {
                if (obj.charAt(obj.length() - 1) == '\n') {
                    obj.delete(obj.length() - 1, 1);
                }
            }
            if (surroundingP) {
                obj.insert(obj.length(), "</p>");
            }
        }
    }
498 
    /**
     * Converts a simplified form of the Wikipedia syntax for tables to
     * &lt;table&gt; markup, in place.
     * <pre>
     * {|
     * | a || b || c
     * |-
     * | d || e || f
     * |}
     * </pre>
     * or e.g.
     * <pre>
     * {|-
     * |+ caption
     * ! A !! B !! C
     * |-
     * | d
     * | e
     * | f
     * |}
     * </pre>
     * A table start is replaced by "&lt;/p&gt;&lt;table&gt;" and the end of the
     * outermost table is followed by "&lt;p&gt;", so the table ends up between
     * paragraphs. Tables that are still open at the end of the text are closed.
     *
     * @param obj the text to rewrite
     * @since MMBase 1.8
     */
    static void handleTables(StringObject obj) {
        int tables = 0; // number of tables currently open (a cell may contain a table)
        int pos = 0;
        while (pos != -1) {
            // always at beginning of line when here.
            int l = obj.length();
            if (pos + 2 < l && ( obj.charAt(pos) == '{' && obj.charAt(pos + 1) == '|')) {
                int skip = 2;
                // allow for starting with {|- as well
                if (pos + skip < l && obj.charAt(pos + skip) == '-') skip++;
                // allow some trailing whitespace
                while(pos + skip < l && Character.isWhitespace(obj.charAt(pos + skip))) {
                    if (obj.charAt(pos + skip ) == '\n') {
                        break;
                    }
                    skip++;
                }
                if (pos + skip >= l) break;
                if (obj.charAt(pos + skip) != '\n') {
                    // something else follows on the line, so this is no table start
                    pos = obj.indexOf("\n", pos + skip);
                    continue;
                }
                skip ++; // include the newline that ends the start line
                log.debug("ok, this is a table!");
                // don't use l onwards, length of obj will change

                // remove up to two newlines in front of the table
                if (pos > 0 && obj.charAt(pos - 1) == '\n') {
                    obj.delete(pos - 1, 1);
                    pos --;
                }
                if (pos > 0 && obj.charAt(pos - 1) == '\n') {
                    obj.delete(pos - 1, 1);
                    pos --;
                }
                tables ++;
                obj.delete(pos, skip);
                obj.insert(pos, "</p><table>");
                pos += 11;
                // optional caption line: |+ caption
                if (obj.charAt(pos) == '|' && obj.charAt(pos + 1) == '+') {
                    obj.delete(pos, 2);
                    obj.insert(pos, "<caption>");
                    pos += 9;
                    // NOTE(review): indexOf yields -1 if the caption is the last
                    // line without a newline; the delete below does not guard
                    // against that - confirm how StringObject.delete reacts.
                    pos = obj.indexOf("\n", pos);
                    obj.delete(pos, 1);
                    obj.insert(pos, "</caption>");
                    pos += 10;
                }
                obj.insert(pos, "<tr>");
                pos += 4;
            }
            if (pos >= obj.length()) break;
            // always in tr here.
            if (tables > 0) {
                if (obj.charAt(pos) == '|') {
                    obj.delete(pos, 1);

                    if (pos + 2 < obj.length() && (obj.charAt(pos) == '-' && obj.charAt(pos + 1) == '\n')) {
                        // "|-" line: start a new row
                        obj.delete(pos, 2);
                        obj.insert(pos, "</tr><tr>");
                        pos += 9;
                    } else if (pos + 1 < obj.length() && (obj.charAt(pos) == '}' && (pos + 2 == obj.length() || obj.charAt(pos + 1) == '\n'))) {
                        // "|}" line: end of table.
                        // NOTE(review): two characters are deleted; when
                        // pos + 2 == length the second one is not checked to be
                        // a newline. A '}' that is the very last character does
                        // not reach this branch at all. Confirm this is intended.
                        obj.delete(pos, 2);
                        obj.insert(pos, "</tr></table>");
                        tables--;
                        pos += 13;
                        if (tables == 0) {
                            // outermost table closed: the text continues in a paragraph
                            obj.insert(pos, "<p>");
                            pos +=3;
                        }
                        while (pos < obj.length() && obj.charAt(pos) == '\n') obj.delete(pos, 1);
                    } else if (pos + 3 < obj.length() && (obj.charAt(pos) == '\n' && obj.charAt(pos + 1) == '{' && obj.charAt(pos + 2) == '|')) {
                        // a lone '|' followed by a "{|" line: a table inside a cell
                        obj.delete(pos, 3);
                        obj.insert(pos, "<td><table><tr>");
                        pos += 15;
                        tables++;
                    } else {
                        // an ordinary cell, running until the newline or the next "||"
                        obj.insert(pos, "<td>");
                        pos += 4;
                        int nl = obj.indexOf("\n", pos);
                        int pipe = obj.indexOf("||", pos);
                        int end = pipe == -1 || nl < pipe ? nl : pipe;
                        // neither found: end becomes the index of the last character
                        if (end == -1) end += obj.length();
                        pos = end;
                        // only one character is removed; of a "||" the second '|'
                        // stays, and starts the next cell in the next iteration
                        obj.delete(pos, 1);
                        obj.insert(pos, "</td>");
                        pos += 5;
                    }
                    continue;
                } else if (obj.charAt(pos) == '!') {
                    // a header cell, running until the newline or the next "!!"
                    obj.delete(pos, 1);
                    obj.insert(pos, "<th>");
                    pos += 4;
                    int nl = obj.indexOf("\n", pos);
                    int pipe = obj.indexOf("!!", pos);
                    int end = pipe == -1 || nl < pipe ? nl : pipe;
                    // neither found: end becomes the index of the last character
                    if (end == -1) end += obj.length();
                    pos = end;
                    // as for "||": the second '!' stays and starts the next header cell
                    obj.delete(pos, 1);
                    obj.insert(pos, "</th>");
                    pos += 5;
                    continue;
                } else {
                    // a line inside a table that starts with neither '|' nor '!':
                    // it is left alone and the scan continues at the next line
                    pos = obj.indexOf("\n", pos) + 1;
                    if (pos >= obj.length()) break;
                    // odd. what to do now?
                }
            } else { // not in table, ignore, find next new line
                pos = obj.indexOf("\n", pos) + 1;
                if (pos == 0) break; // no further newline
                if (pos >= obj.length()) break;
            }
        }
        // close the tables that were never closed explicitly
        while (tables > 0) {
            obj.insert(pos, "</tr></table>");
            pos+= 13;
            tables--;
            if (tables == 0) {
                obj.insert(pos, "<p>");
                pos += 3;
                while (pos < obj.length() && obj.charAt(pos) == '\n') obj.delete(pos, 1);
            }
        }

    }
645     /**
646      * Removes all new lines and space which are too much.
647      */
648     static void cleanupText(StringObject obj) {
649         // remaining new lines have no meaning.
650         obj.replace(">\n", ">"); // don't replace by space if it is just after a tag, it could have a meaning then.
651         obj.replace("\n", " "); // replace by space, because people could use it as word boundary.
652         // remaining double spaces have no meaning as well:
653         int pos = obj.indexOf(" ", 0);
654         while (pos != -1) {
655             pos++;
656             while (obj.length() > pos && obj.charAt(pos) == ' ') {
657                 obj.delete(pos, 1);
658             }
659             pos = obj.indexOf(" ", pos);
660         }
661         // we used \r for non significant newlines:
662         obj.replace("\r", "");
663 
664     }
665 
666     /**
667      * Only escape, clean up.
668      * @since MMBase-1.7
669      */
670     protected static void handleFormat(StringObject obj, boolean format) {
671         if (format) {
672             obj.replace("\r", "\n");
673         } else {
674             cleanupText(obj);
675         }
676 
677     }
678     protected static String   prepareDataString(String   data) {
679         return Xml.XMLEscape(data).replaceAll("\r", ""); // drop returns (\r), we work with newlines, \r will be used as a help.
680     }
681     protected static StringObject prepareData(String   data) {
682         return new StringObject(prepareDataString(data));
683     }
684 
685     protected static void handleRich(StringObject obj, boolean sections, boolean leaveExtraNewLines, boolean surroundingP) {
686         handleRich(obj, sections, leaveExtraNewLines, surroundingP, false);
687     }
688 
689     protected static void handleRich(StringObject obj, boolean sections, boolean leaveExtraNewLines, boolean surroundingP, boolean placeListsInsideP) {
690         // the order _is_ important!
691         handleList(obj);
692         handleTables(obj);
693         handleParagraphs(obj, leaveExtraNewLines, surroundingP, placeListsInsideP);
694         if (sections) {
695             handleHeaders(obj);
696         }
697         handleEmph(obj, '_', "em");
698         handleEmph(obj, '*', "strong");
699     }
700 
701     static void handleNewlines(StringObject obj) {
702         obj.replace("</ul>\n", "</ul>"); // otherwise we will wind up with the silly "</ul><br />" the \n was necessary for </ul></p>
703         obj.replace("\n", "<br />\r");  // handle new remaining newlines.
704     }
705 
706     private static Pattern wikiWrappingAnchor = Pattern.compile("\\[(\\w+):(.*?)\\]");
707     private static Pattern wikiP = Pattern.compile("<p>\\[(\\w+)\\]");
708     private static Pattern wikiSection = Pattern.compile("<section><h>\\[(\\w+)\\]");
709     private static Pattern wikiAnchor = Pattern.compile("\\[(\\w+)\\]");
710 
711     public static String   wikiToXML(String   data) {
712         Matcher wrappingAnchors = wikiWrappingAnchor.matcher(prepareDataString(data));
713         data = wrappingAnchors.replaceAll("<a id=\"$1\">$2</a>");
714         StringObject obj = new StringObject(data);
715         handleRich(obj, true, false, true);
716         handleFormat(obj, false);
717         String   string = obj.toString();
718         Matcher ps = wikiP.matcher(string);
719         string = ps.replaceAll("<p id=\"$1\">");
720         Matcher sections = wikiSection.matcher(string);
721         string = sections.replaceAll("<section id=\"$1\"><h>");
722         Matcher anchors = wikiAnchor.matcher(string);
723         string = anchors.replaceAll("<a id=\"$1\" />");
724         return string;
725 
726     }
727 
728     /**
729      * Defines a kind of 'rich' text format. This is a way to easily
730      * type structured text in XML.  The XML tags which can be
731      * produced by this are all HTML as well.
732      *
733      * This is a generalisation of the MMBase html() functions which
734      * does similar duties, but hopefully this one is better, and more
735      * powerfull too.
736      *
737      * The following things are recognized:
738      * <ul>
739      *  <li> Firstly, XMLEscape is called.</li>
740      *  <li> A line starting with an asterix (*) will start an unnumberd
741      *       list. The first new line not starting with a space or an other
742      *       asterix will end the list </li>
743      *  <li> Underscores are translated to the emphasize HTML-tag</li>
744      *  <li> You can create a header tag by by starting a line with a dollar signs</li>
745      *  <li> A paragraph can be begun (and ended) with an empty line.</li>
746      * </ul>
747      *
748      * Test with commandline: java org.mmbase.util.Encode RICH_TEXT (reads from STDIN)
749      *
750      * @param data text to convert
751      * @param format if the resulting XML must be nicely formatted (default: false)
752      * @return the converted text
753      */
754 
755     public static String   richToXML(String   data, boolean format) {
756         StringObject obj = prepareData(data);
757         handleRich(obj, true, true, true);
758         handleNewlines(obj);
759         handleFormat(obj, format);
760         return obj.toString();
761     }
762     public static String   richToXML(String   data) {
763         return richToXML(data, false);
764     }
765     /**
766      * As richToXML but a little less rich. Which means that only one new line is non significant.
767      * @see #richToXML
768      */
769 
770     public static String   poorToXML(String   data, boolean format) {
771         StringObject obj = prepareData(data);
772         handleRich(obj, true, false,true);
773         handleFormat(obj, format);
774         return obj.toString();
775     }
776 
777     public static String   poorToXML(String   data) {
778         return poorToXML(data, false);
779     }
780 
781     /**
782      * So poor, that it actually generates pieces of XHTML 1.1 blocks (so, no use of sections).
783      *
784      * @see #richToXML
785      * @since MMBase-1.7
786      */
787     public static String   richToHTMLBlock(String   data, boolean multipibleBrs, boolean surroundingP, boolean placeListsInsideP) {
788         StringObject obj = prepareData(data);
789         handleRich(obj, false, multipibleBrs, surroundingP, placeListsInsideP);   // no <section> tags, leave newlines if multipble br's requested
790         handleNewlines(obj);
791         handleFormat(obj, false);
792         return obj.toString();
793     }
794 
795 
796     public static String   richToHTMLBlock(String   data) {
797         return richToHTMLBlock(data, false, true, true);
798     }
799 
800     public static String   richToHTMLBlock(String   data, boolean multipibleBrs, boolean surroundingP) {
801         return richToHTMLBlock(data, multipibleBrs, surroundingP, true);
802     }
803 
804     /**
805      * So poor, that it actually generates pieces of XHTML 1.1 inlines (so, no use of section, br, p).
806      *
807      * @since MMBase-1.7
808      */
809     public static String   poorToHTMLInline(String   data) {
810         StringObject obj = prepareData(data);
811         // don't add newlines.
812         handleFormat(obj, false);
813         handleEmph(obj, '_', "em");
814         handleEmph(obj, '*', "strong");
815         return obj.toString();
816     }
817 
818 
819     /**
820      * Base function for XSL conversions.
821      */
822 
823     protected static String   XSLTransform(String   xslFile, String   data) {
824         try {
825             java.net.URL   u = ResourceLoader.getConfigurationRoot().getResource("xslt/" + xslFile);
826             java.io.StringWriter   res = new java.io.StringWriter  ();
827             XSLTransformer.transform(new StreamSource  (new StringReader(data)), u, new StreamResult  (res), null);
828             return res.toString();
829         } catch (javax.xml.transform.TransformerException   te) {
830             return te.getMessage();
831         }
832     }
833 
834     protected static void validate(String   incoming) throws FormatException {
835         try {
836             if (log.isDebugEnabled()) {
837                 log.debug("Validating " + incoming);
838             }
839             javax.xml.parsers.DocumentBuilderFactory   dfactory = javax.xml.parsers.DocumentBuilderFactory.newInstance();
840 
841             // turn validating on..
842             dfactory.setValidating(true);
843             dfactory.setNamespaceAware(true);
844             javax.xml.parsers.DocumentBuilder   documentBuilder = dfactory.newDocumentBuilder();
845 
846             // in order to find the dtd.....
847             org.mmbase.util.XMLEntityResolver resolver = new org.mmbase.util.XMLEntityResolver();
848             documentBuilder.setEntityResolver(resolver);
849 
850             // in order to log our xml-errors
851             StringBuffer   errorBuff = new StringBuffer  ();
852             ErrorHandler errorHandler = new ErrorHandler(errorBuff);
853             documentBuilder.setErrorHandler(errorHandler);
854             // documentBuilder.init();
855             java.io.InputStream   input = new java.io.ByteArrayInputStream  (incoming.getBytes(CODING));
856             documentBuilder.parse(input);
857 
858             if (!resolver.hasDTD()) {
859                 throw new FormatException("no doc-type specified for the xml");
860             }
861             if (errorHandler.errorOrWarning) {
862                 throw new FormatException("error in xml: \n" + errorBuff.toString());
863             }
864         } catch (javax.xml.parsers.ParserConfigurationException   pce) {
865             throw new FormatException("[sax parser] not well formed xml: " + pce.toString());
866         } catch (org.xml.sax.SAXException   se) {
867             log.debug("", se);
868             //throw new FormatException("[sax] not well formed xml: "+se.toString() + "("+se.getMessage()+")");
869         } catch (java.io.IOException   ioe) {
870             throw new FormatException("[io] not well formed xml: " + ioe.toString());
871         }
872     }
873 
874     protected static class FormatException extends java.lang.Exception   {
875         FormatException(String   msg) {
876             super(msg);
877         }
878     }
879 
880     // Catch any errors or warnings,....
881     static class ErrorHandler implements org.xml.sax.ErrorHandler   {
882         boolean errorOrWarning;
883         StringBuffer   errorBuff;
884 
885         ErrorHandler(StringBuffer   errorBuff) {
886             super();
887             this.errorBuff = errorBuff;
888             errorOrWarning = false;
889         }
890 
891         // all methods from org.xml.sax.ErrorHandler
892         // from org.xml.sax.ErrorHandler
893         public void fatalError(org.xml.sax.SAXParseException   exc) {
894             errorBuff.append("FATAL[" + getLocationString(exc) + "]:" + exc.getMessage() + "\n");
895             errorOrWarning = true;
896         }
897 
898         // from org.xml.sax.ErrorHandler
899         public void error(org.xml.sax.SAXParseException   exc) {
900             errorBuff.append("Error[" + getLocationString(exc) + "]: " + exc.getMessage() + "\n");
901             errorOrWarning = true;
902         }
903 
904         // from org.xml.sax.ErrorHandler
905         public void warning(org.xml.sax.SAXParseException   exc) {
906             errorBuff.append("Warning[" + getLocationString(exc) + "]:" + exc.getMessage() + "\n");
907             errorOrWarning = true;
908         }
909 
910         // helper methods
911         /**
912          * Returns a string of the location.
913          */
914         private String   getLocationString(org.xml.sax.SAXParseException   ex) {
915             StringBuffer   str = new StringBuffer  ();
916             String   systemId = ex.getSystemId();
917             if (systemId != null) {
918                 int index = systemId.lastIndexOf('/');
919                 if (index != -1) {
920                     systemId = systemId.substring(index + 1);
921                 }
922                 str.append(systemId);
923             }
924             str.append(" line:");
925             str.append(ex.getLineNumber());
926             str.append(" column:");
927             str.append(ex.getColumnNumber());
928             return str.toString();
929         }
930     }
931 
932     public XmlField() {
933         super();
934     }
935     public XmlField(int to) {
936         super(to);
937     }
938 
939     public Map   transformers() {
940         Map   h = new HashMap  ();
941         h.put("MMXF_ASCII", new Config(XmlField.class, ASCII, "Converts xml to ASCII (cannoted be reversed)"));
942         h.put("MMXF_BODY_RICH", new Config(XmlField.class, RICHBODY, "XHTML 2 compliant XML."));
943         h.put("MMXF_BODY_POOR", new Config(XmlField.class, POORBODY, "XHTML 2 compliant XML, but withough <br/> tags"));
944         h.put("MMXF_HTML_INLINE", new Config(XmlField.class, HTML_INLINE, "Decodes only escaping and with <em>"));
945         h.put("MMXF_HTML_BLOCK", new Config(XmlField.class,  HTML_BLOCK, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>"));
946         h.put("MMXF_HTML_BLOCK_BR", new Config(XmlField.class,  HTML_BLOCK_BR, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>"));
947         h.put("MMXF_HTML_BLOCK_NOSURROUNDINGP", new Config(XmlField.class,  HTML_BLOCK_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>"));
948         h.put("MMXF_HTML_BLOCK_BR_NOSURROUNDINGP", new Config(XmlField.class,  HTML_BLOCK_BR_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>"));
949         h.put("MMXF_HTML_BLOCK_LIST", new Config(XmlField.class,  HTML_BLOCK_LIST, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>, with <ul> inside the <p>"));
950         h.put("MMXF_HTML_BLOCK_LIST_NOSURROUNDINGP", new Config(XmlField.class,  HTML_BLOCK_LIST_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>, with <ul> inside the <p>"));
951         h.put("MMXF_HTML_BLOCK_LIST_BR", new Config(XmlField.class,  HTML_BLOCK_LIST_BR, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>, with <ul> inside the <p>"));
952         h.put("MMXF_HTML_BLOCK_LIST_BR_NOSURROUNDINGP", new Config(XmlField.class,  HTML_BLOCK_LIST_BR_NOSURROUNDINGP, "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>, with <ul> inside the <p>"));
953         h.put("MMXF_XHTML", new Config(XmlField.class, XHTML, "Converts to piece of XHTML"));
954         return h;
955     }
956 
957     public String   transform(String   data) {
958         switch (to) {
959         case RICHBODY :
960         case POORBODY :
961             throw new UnsupportedOperationException  ();
962             // XXXX
963             // needing richtext xslt here.
964             //return XSLTransform("mmxf2rich.xslt", XML_TAGSTART + data + XML_TAGEND);
965         case ASCII :
966             return XSLTransform("text.xslt", data);
967         case HTML_BLOCK:
968         case HTML_BLOCK_BR:
969         case HTML_INLINE:
970             throw new UnsupportedOperationException  ("Cannot transform");
971         default :
972             throw new UnknownCodingException(getClass(), to);
973         }
974     }
975 
976     public String   transformBack(String   r) {
977         String   result = null;
978         switch (to) {
979         case RICHBODY :
980             result = richToXML(r);
981             // rich will not be validated... Cannot be used yet!!
982             break;
983         case POORBODY :
984             result = poorToXML(r);
985             break;
986         case HTML_BLOCK:
987             result = richToHTMLBlock(r, false, true, true);
988             break;
989         case HTML_BLOCK_BR:
990             result = richToHTMLBlock(r, true, true, true);
991             break;
992         case HTML_BLOCK_NOSURROUNDINGP:
993             result = richToHTMLBlock(r, false, false, true);
994             break;
995         case HTML_BLOCK_BR_NOSURROUNDINGP:
996             result = richToHTMLBlock(r, true, false, true);
997             break;
998 
999         case HTML_BLOCK_LIST:
1000            result = richToHTMLBlock(r, false, true, false);
1001            break;
1002        case HTML_BLOCK_LIST_BR:
1003            result = richToHTMLBlock(r, true, true, false);
1004            break;
1005        case HTML_BLOCK_LIST_NOSURROUNDINGP:
1006            result = richToHTMLBlock(r, false, false, false);
1007            break;
1008        case HTML_BLOCK_LIST_BR_NOSURROUNDINGP:
1009            result = richToHTMLBlock(r, true, false, false);
1010            break;
1011
1012        case HTML_INLINE:
1013            result = poorToHTMLInline(r);
1014            break;
1015        case ASCII :
1016            throw new UnsupportedOperationException  ("Cannot transform");
1017        default :
1018            throw new UnknownCodingException(getClass(), to);
1019        }
1020        return result;
1021    }
1022
1023    public String   getEncoding() {
1024        switch (to) {
1025        case RICHBODY :
1026            return "MMXF_BODY_RICH";
1027        case POORBODY :
1028            return "MMXF_BODY_POOR";
1029        case HTML_BLOCK :
1030            return "MMXF_HTML_BLOCK";
1031        case HTML_BLOCK_BR :
1032            return "MMXF_HTML_BLOCK_BR";
1033        case HTML_BLOCK_NOSURROUNDINGP :
1034            return "MMXF_HTML_BLOCK_NOSURROUNDINGP";
1035        case HTML_BLOCK_BR_NOSURROUNDINGP :
1036            return "MMXF_HTML_BLOCK_BR_NOSURROUNDINGP";
1037        case HTML_BLOCK_LIST :
1038            return "MMXF_HTML_BLOCK_LIST";
1039        case HTML_BLOCK_LIST_BR :
1040            return "MMXF_HTML_BLOCK_LIST_BR";
1041        case HTML_BLOCK_LIST_NOSURROUNDINGP :
1042            return "MMXF_HTML_BLOCK_LIST_NOSURROUNDINGP";
1043        case HTML_BLOCK_LIST_BR_NOSURROUNDINGP :
1044            return "MMXF_HTML_BLOCK_LIST_BR_NOSURROUNDINGP";
1045        case HTML_INLINE :
1046            return "MMXF_HTML_INLINE";
1047        case ASCII :
1048            return "MMXF_ASCII";
1049        default :
1050            throw new UnknownCodingException(getClass(), to);
1051        }
1052    }
1053}
1054
A to Z: JavaDoc & Examples Daily Java News & Articles Open Source Projects Open Source Codes Free Computer Books Remove Frame
Popular Tags