001/**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.camel.support;
018
019import java.io.Closeable;
020import java.io.IOException;
021import java.io.InputStream;
022import java.text.MessageFormat;
023import java.util.ArrayList;
024import java.util.Iterator;
025import java.util.LinkedHashMap;
026import java.util.List;
027import java.util.Map;
028import java.util.Scanner;
029import java.util.regex.MatchResult;
030import java.util.regex.Matcher;
031import java.util.regex.Pattern;
032
033import org.apache.camel.Exchange;
034import org.apache.camel.InvalidPayloadException;
035import org.apache.camel.util.IOHelper;
036import org.apache.camel.util.ObjectHelper;
037
038/**
039 * {@link org.apache.camel.Expression} to walk a {@link org.apache.camel.Message} XML body
040 * using an {@link java.util.Iterator}, which grabs the content between a XML start and end token,
041 * where the end token corresponds implicitly to either the end tag or the self-closing start tag.
042 * <p/>
043 * The message body must be able to convert to {@link java.io.InputStream} type which is used as stream
044 * to access the message body.
045 * <p/>
046 * Can be used to split big XML files.
047 * <p/>
048 * This implementation supports inheriting namespaces from a parent/root tag.
049 */
050public class TokenXMLExpressionIterator extends ExpressionAdapter {
051    private static final Pattern NAMESPACE_PATTERN = Pattern.compile("xmlns(:\\w+|)\\s*=\\s*('[^']+'|\"[^\"]+\")");
052    private static final String SCAN_TOKEN_NS_PREFIX_REGEX = "([^:<>]{1,15}?:|)";
053    private static final String SCAN_BLOCK_TOKEN_REGEX_TEMPLATE = "<{0}(\\s+[^>]*)?/>|<{0}(\\s+[^>]*)?>(?:(?!(</{0}\\s*>)).)*</{0}\\s*>";
054    private static final String SCAN_PARENT_TOKEN_REGEX_TEMPLATE = "<{0}(\\s+[^>]*\\s*)?>";
055    private static final String OPTION_WRAP_TOKEN = "<*>";
056
057    protected final String tagToken;
058    protected final String inheritNamespaceToken;
059
060    public TokenXMLExpressionIterator(String tagToken, String inheritNamespaceToken) {
061        ObjectHelper.notEmpty(tagToken, "tagToken");
062        this.tagToken = tagToken;
063        // namespace token is optional
064        this.inheritNamespaceToken = inheritNamespaceToken;
065
066        // must be XML tokens
067        if (!tagToken.startsWith("<") || !tagToken.endsWith(">")) {
068            throw new IllegalArgumentException("XML Tag token must be a valid XML tag, was: " + tagToken);
069        }
070        if (inheritNamespaceToken != null && (!inheritNamespaceToken.startsWith("<") || !inheritNamespaceToken.endsWith(">"))) {
071            throw new IllegalArgumentException("Namespace token must be a valid XML token, was: " + inheritNamespaceToken);
072        }
073    }
074
075    protected Iterator<?> createIterator(InputStream in, String charset) {
076        XMLTokenIterator iterator = new XMLTokenIterator(tagToken, inheritNamespaceToken, in, charset);
077        iterator.init();
078        return iterator;
079    }
080
081    @Override
082    public boolean matches(Exchange exchange) {
083        // as a predicate we must close the stream, as we do not return an iterator that can be used
084        // afterwards to iterate the input stream
085        Object value = doEvaluate(exchange, true);
086        return ObjectHelper.evaluateValuePredicate(value);
087    }
088
089    @Override
090    public Object evaluate(Exchange exchange) {
091        // as we return an iterator to access the input stream, we should not close it
092        return doEvaluate(exchange, false);
093    }
094
095    /**
096     * Strategy to evaluate the exchange
097     *
098     * @param exchange   the exchange
099     * @param closeStream whether to close the stream before returning from this method.
100     * @return the evaluated value
101     */
102    protected Object doEvaluate(Exchange exchange, boolean closeStream) {
103        InputStream in = null;
104        try {
105            in = exchange.getIn().getMandatoryBody(InputStream.class);
106            // we may read from a file, and want to support custom charset defined on the exchange
107            String charset = IOHelper.getCharsetName(exchange);
108            return createIterator(in, charset);
109        } catch (InvalidPayloadException e) {
110            exchange.setException(e);
111            // must close input stream
112            IOHelper.close(in);
113            return null;
114        } finally {
115            if (closeStream) {
116                IOHelper.close(in);
117            }
118        }
119    }
120    
121    /**
122     * Iterator to walk the input stream
123     */
124    static class XMLTokenIterator implements Iterator<Object>, Closeable {
125        final String tagToken;
126        final InputStream in;
127        final String charset;
128        Scanner scanner;
129        Object image;
130
131        private final Pattern tagTokenPattern;
132        private final String inheritNamespaceToken;
133        private final boolean wrapToken;
134        private Pattern inheritNamespaceTokenPattern;
135        private String rootTokenNamespaces;
136        private String wrapHead;
137        private String wrapTail;
138
139        XMLTokenIterator(String tagToken, String inheritNamespaceToken, InputStream in, String charset) {
140            this.tagToken = tagToken;
141            this.charset = charset;
142          
143            // remove any beginning < and ending > as we need to support ns prefixes and attributes, so we use a reg exp patterns
144            this.tagTokenPattern = 
145                Pattern.compile(MessageFormat.format(SCAN_BLOCK_TOKEN_REGEX_TEMPLATE, 
146                                                     SCAN_TOKEN_NS_PREFIX_REGEX + tagToken.substring(1, tagToken.length() - 1)), 
147                                                     Pattern.MULTILINE | Pattern.DOTALL);
148            
149            this.inheritNamespaceToken = inheritNamespaceToken;
150            if (inheritNamespaceToken != null && OPTION_WRAP_TOKEN.equals(inheritNamespaceToken)) {
151                this.wrapToken = true;
152                this.in = new RecordableInputStream(in, charset);
153            } else {
154                this.wrapToken = false;
155                this.in = in;
156                if (inheritNamespaceToken != null) {
157                    // the inherit namespace token may itself have a namespace prefix
158                    // the namespaces on the parent tag can be in multi line, so we need to instruct the dot to support multilines
159                    this.inheritNamespaceTokenPattern = 
160                        Pattern.compile(MessageFormat.format(SCAN_PARENT_TOKEN_REGEX_TEMPLATE,
161                                                             SCAN_TOKEN_NS_PREFIX_REGEX + inheritNamespaceToken.substring(1, inheritNamespaceToken.length() - 1)), 
162                                                             Pattern.MULTILINE | Pattern.DOTALL);
163                }
164            }
165        }
166
167        void init() {
168            // use a scanner with the default delimiter
169            this.scanner = new Scanner(in, charset);
170            this.image = scanner.hasNext() ? (String) next(true) : null;
171        }
172
173        String getNext(boolean first) {
174            // initialize inherited namespaces on first
175            if (first && inheritNamespaceToken != null && !wrapToken) {
176                rootTokenNamespaces =  getNamespacesFromNamespaceToken(scanner.findWithinHorizon(inheritNamespaceTokenPattern, 0));
177            }
178
179            String next = scanner.findWithinHorizon(tagTokenPattern, 0);
180            if (next == null) {
181                return null;
182            }
183            if (first && wrapToken) {
184                MatchResult mres = scanner.match();
185                wrapHead = ((RecordableInputStream)in).getText(mres.start());
186                wrapTail = buildXMLTail(wrapHead);
187            }
188
189            // build answer accordingly to whether namespaces should be inherited or not
190            if (inheritNamespaceToken != null && rootTokenNamespaces != null) {
191                // REVISIT should skip the prefixes that are declared within the child itself.
192                String head = ObjectHelper.before(next, ">");
193                boolean empty = false;
194                if (head.endsWith("/")) {
195                    head = head.substring(0, head.length() - 1);
196                    empty = true;
197                }
198                StringBuilder sb = new StringBuilder();
199                // append root namespaces to local start token
200                // grab the text
201                String tail = ObjectHelper.after(next, ">");
202                // build result with inherited namespaces
203                next = sb.append(head).append(rootTokenNamespaces).append(empty ? "/>" : ">").append(tail).toString();
204            } else if (wrapToken) {
205                // wrap the token
206                StringBuilder sb = new StringBuilder();
207                next = sb.append(wrapHead).append(next).append(wrapTail).toString();
208            }
209            
210            return next;
211        }
212
213        private String getNamespacesFromNamespaceToken(String text) {
214            if (text == null) {
215                return null;
216            }
217
218            // find namespaces (there can be attributes mixed, so we should only grab the namespaces)
219            Map<String, String> namespaces = new LinkedHashMap<String, String>();
220            Matcher matcher = NAMESPACE_PATTERN.matcher(text);
221            while (matcher.find()) {
222                String prefix = matcher.group(1);
223                String url = matcher.group(2);
224                if (ObjectHelper.isEmpty(prefix)) {
225                    prefix = "_DEFAULT_";
226                } else {
227                    // skip leading :
228                    prefix = prefix.substring(1);
229                }
230                namespaces.put(prefix, url);
231            }
232
233            // did we find any namespaces
234            if (namespaces.isEmpty()) {
235                return null;
236            }
237
238            // build namespace String
239            StringBuilder sb = new StringBuilder();
240            for (Map.Entry<String, String> entry : namespaces.entrySet()) {
241                String key = entry.getKey();
242                // note the value is already quoted
243                String value = entry.getValue();
244                if ("_DEFAULT_".equals(key)) {
245                    sb.append(" xmlns=").append(value);
246                } else {
247                    sb.append(" xmlns:").append(key).append("=").append(value);
248                }
249            }
250
251            return sb.toString();
252        }
253        
254        @Override
255        public boolean hasNext() {
256            return image != null;
257        }
258
259        @Override
260        public Object next() {
261            return next(false);
262        }
263
264        Object next(boolean first) {
265            Object answer = image;
266            // calculate next
267            if (scanner.hasNext()) {
268                image = getNext(first);
269            } else {
270                image = null;
271            }
272
273            if (answer == null) {
274                // first time the image may be null
275                answer = image;
276            }
277            return answer;
278        }
279
280        @Override
281        public void remove() {
282            // noop
283        }
284
285        @Override
286        public void close() throws IOException {
287            scanner.close();
288        }
289
290    }
291
292    private static String buildXMLTail(String xmlhead) {
293        // assume the input text is a portion of a well-formed xml
294        List<String> tags = new ArrayList<String>();
295        int p = 0;
296        while (p < xmlhead.length()) {
297            p = xmlhead.indexOf('<', p);
298            if (p < 0) {
299                break;
300            }
301            int nc = xmlhead.charAt(p + 1); 
302            if (nc == '?') {
303                p++;
304                continue;
305            } else if (nc == '/') {
306                p++;
307                tags.remove(tags.size() - 1);
308            } else {
309                final int ep = xmlhead.indexOf('>', p);
310                if (xmlhead.charAt(ep - 1) == '/') {
311                    p++;
312                    continue;
313                }
314                final int sp = xmlhead.substring(p, ep).indexOf(' ');
315                tags.add(xmlhead.substring(p + 1, sp > 0 ? p + sp : ep));
316                p = ep;
317            }
318        }
319        StringBuilder sb = new StringBuilder();
320        for (int i = tags.size() - 1; i >= 0; i--) {
321            sb.append("</").append(tags.get(i)).append(">");
322        }
323        return sb.toString();
324    }
325}