001/**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.camel.util;
018
019import java.util.ArrayList;
020import java.util.BitSet;
021import java.util.List;
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
025/**
026 * Encoder for unsafe URI characters.
027 * <p/>
028 * A good source for details is <a href="http://en.wikipedia.org/wiki/Url_encode">wikipedia url encode</a> article.
029 */
public final class UnsafeUriCharactersEncoder {

    /** Matches a <tt>RAW(...)</tt> token, whose content is a literal value and never "already encoded". */
    private static final Pattern RAW_PATTERN = Pattern.compile("RAW\\([^\\)]+\\)");

    /** Digits used when writing a <tt>%XX</tt> escape (always upper case). */
    private static final char[] HEX_DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7',
                                              '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /** Default set of unsafe characters, used by {@link #encode(String)}. */
    private static final BitSet UNSAFE_CHARACTERS_RFC1738 = toBitSet(" \"<>#%{}|\\^~[]`");

    /** Same as the default set but without <tt>[</tt> and <tt>]</tt>, used by {@link #encodeHttpURI(String)}. */
    private static final BitSet UNSAFE_CHARACTERS_HTTP = toBitSet(" \"<>#%{}|\\^~`");

    private UnsafeUriCharactersEncoder() {
        // util class
    }

    /**
     * Encodes the unsafe characters of the given string using the default unsafe character set.
     * No <tt>RAW(...)</tt> detection is performed.
     *
     * @param s the string to encode, may be <tt>null</tt>
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty
     *         or contains no unsafe characters
     */
    public static String encode(String s) {
        return encode(s, UNSAFE_CHARACTERS_RFC1738);
    }

    /**
     * Encodes the unsafe characters of the given string using the HTTP unsafe character set,
     * which leaves <tt>[</tt> and <tt>]</tt> untouched. No <tt>RAW(...)</tt> detection is performed.
     *
     * @param s the string to encode, may be <tt>null</tt>
     * @return the encoded string, or <tt>s</tt> itself if nothing had to be encoded
     */
    public static String encodeHttpURI(String s) {
        return encode(s, UNSAFE_CHARACTERS_HTTP);
    }

    /**
     * Encodes the given string using a caller supplied set of unsafe characters.
     * No <tt>RAW(...)</tt> detection is performed.
     *
     * @param s                the string to encode, may be <tt>null</tt>
     * @param unsafeCharacters the characters (as bit indexes) which must be escaped
     * @return the encoded string, or <tt>s</tt> itself if nothing had to be encoded
     */
    public static String encode(String s, BitSet unsafeCharacters) {
        return encode(s, unsafeCharacters, false);
    }

    /**
     * Encodes the given string using the default unsafe character set.
     *
     * @param s        the string to encode, may be <tt>null</tt>
     * @param checkRaw whether <tt>RAW(...)</tt> tokens should be detected
     * @return the encoded string, or <tt>s</tt> itself if nothing had to be encoded
     * @see #encode(String, BitSet, boolean)
     */
    public static String encode(String s, boolean checkRaw) {
        return encode(s, UNSAFE_CHARACTERS_RFC1738, checkRaw);
    }

    /**
     * Encodes the given string using the HTTP unsafe character set.
     *
     * @param s        the string to encode, may be <tt>null</tt>
     * @param checkRaw whether <tt>RAW(...)</tt> tokens should be detected
     * @return the encoded string, or <tt>s</tt> itself if nothing had to be encoded
     * @see #encode(String, BitSet, boolean)
     */
    public static String encodeHttpURI(String s, boolean checkRaw) {
        return encode(s, UNSAFE_CHARACTERS_HTTP, checkRaw);
    }

    /**
     * Replaces every unsafe character of the given string with its <tt>%XX</tt> escape.
     * <p/>
     * Only characters in the range 1..127 are considered; all other characters are copied as is.
     * A <tt>%</tt> which is followed by two hex digits is regarded as an already encoded value
     * and is left alone, unless <tt>checkRaw</tt> is enabled and the <tt>%</tt> is located
     * inside a <tt>RAW(...)</tt> token. A raw value is a literal, so such a <tt>%</tt> is
     * escaped as <tt>%25</tt>.
     *
     * @param s                the string to encode, may be <tt>null</tt>
     * @param unsafeCharacters the characters (as bit indexes) which must be escaped
     * @param checkRaw         whether <tt>RAW(...)</tt> tokens should be detected
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty
     *         or contains no unsafe characters
     */
    public static String encode(String s, BitSet unsafeCharacters, boolean checkRaw) {
        // must be checked before anything else, as the RAW scan cannot cope with null
        if (s == null || s.length() == 0) {
            return s;
        }

        // first check whether we actually need to encode
        char[] chars = s.toCharArray();
        if (!containsUnsafe(chars, unsafeCharacters)) {
            return s;
        }

        // only scan for RAW tokens when there is something to encode
        List<Pair> rawPairs = checkRaw ? checkRAW(s) : new ArrayList<Pair>();

        // okay there are some unsafe characters so we do need to encode
        // see details at: http://en.wikipedia.org/wiki/Url_encode
        StringBuilder sb = new StringBuilder(chars.length + 16);
        for (int i = 0; i < chars.length; i++) {
            char ch = chars[i];
            if (!isUnsafe(ch, unsafeCharacters)) {
                sb.append(ch);
            } else if (ch == '%' && isFollowedByTwoHexDigits(chars, i) && !isRaw(i, rawPairs)) {
                // its already encoded (percent encoded) so just append as is
                sb.append(ch);
            } else {
                // must escape then, as its an unsafe character
                appendEscape(sb, ch);
            }
        }
        return sb.toString();
    }

    /**
     * Creates a bit set with one bit set for each character of the given string.
     */
    private static BitSet toBitSet(String characters) {
        BitSet answer = new BitSet(256);
        for (int i = 0; i < characters.length(); i++) {
            answer.set(characters.charAt(i));
        }
        return answer;
    }

    /**
     * Finds the positions of all the <tt>RAW(...)</tt> tokens in the given string,
     * in the order they occur.
     */
    private static List<Pair> checkRAW(String s) {
        Matcher matcher = RAW_PATTERN.matcher(s);
        List<Pair> answer = new ArrayList<Pair>();
        // Check all occurrences
        while (matcher.find()) {
            // note that Matcher.end() is the offset after the last matched character
            answer.add(new Pair(matcher.start(), matcher.end()));
        }
        return answer;
    }

    /**
     * Whether the given index is located inside one of the <tt>RAW(...)</tt> tokens.
     * The pairs must be ordered by their start position and must not overlap.
     */
    private static boolean isRaw(int index, List<Pair> pairs) {
        for (Pair pair : pairs) {
            if (index < pair.start) {
                // the remaining pairs start even later, so there is no need to look further
                return false;
            }
            if (index < pair.end) {
                return true;
            }
        }
        return false;
    }

    /**
     * Whether the character is in the range 1..127 and marked as unsafe.
     */
    private static boolean isUnsafe(char ch, BitSet unsafeCharacters) {
        // just deal with the ascii character
        return ch > 0 && ch < 128 && unsafeCharacters.get(ch);
    }

    /**
     * Whether at least one of the characters is unsafe.
     */
    private static boolean containsUnsafe(char[] chars, BitSet unsafeCharacters) {
        for (char ch : chars) {
            if (isUnsafe(ch, unsafeCharacters)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Whether the two characters following the given index exist and are both hex digits.
     */
    private static boolean isFollowedByTwoHexDigits(char[] chars, int index) {
        return index + 2 < chars.length && isHexDigit(chars[index + 1]) && isHexDigit(chars[index + 2]);
    }

    /**
     * Appends the <tt>%XX</tt> escape of the given character, which must be less than 256.
     */
    private static void appendEscape(StringBuilder sb, char ch) {
        sb.append('%');
        sb.append(HEX_DIGITS[(ch >> 4) & 0x0f]);
        sb.append(HEX_DIGITS[ch & 0x0f]);
    }

    /**
     * Whether the character is a hex digit in either upper or lower case.
     */
    private static boolean isHexDigit(char ch) {
        return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
    }

    /**
     * Position of a <tt>RAW(...)</tt> token; start is inclusive and end is exclusive.
     */
    private static final class Pair {
        final int start;
        final int end;

        Pair(int start, int end) {
            this.start = start;
            this.end = end;
        }
    }

}