001/**
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.camel.util;
018
019import java.util.ArrayList;
020import java.util.BitSet;
021import java.util.List;
022import java.util.regex.Matcher;
023import java.util.regex.Pattern;
024
/**
 * Encoder for unsafe URI characters.
 * <p>
 * Every unsafe ASCII character is replaced by its percent-encoded form
 * (<tt>%XX</tt>, upper case hex digits). A <tt>%</tt> sign that is already
 * followed by two hex digits is regarded as an existing escape sequence and
 * is left untouched, unless it is located inside a <tt>RAW(...)</tt> token
 * and RAW checking has been requested.
 * <p>
 * A good source for details is <a href="http://en.wikipedia.org/wiki/Url_encode">wikipedia url encode</a> article.
 */
public final class UnsafeUriCharactersEncoder {

    /** Hex digits used when writing an escape sequence. */
    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /** Matches a <tt>RAW(...)</tt> token with a non-empty value that contains no closing parenthesis. */
    private static final Pattern RAW_PATTERN = Pattern.compile("RAW\\([^\\)]+\\)");

    /** Characters which are unsafe in both the generic and the HTTP flavour. */
    private static final String COMMON_UNSAFE_CHARACTERS = " \"<>#%{}|\\^~`";

    /** Unsafe characters for generic URIs: the common set plus the square brackets. */
    private static final BitSet UNSAFE_CHARACTERS_RFC1738 = toBitSet(COMMON_UNSAFE_CHARACTERS + "[]");

    /** Unsafe characters for HTTP URIs: square brackets are kept as-is. */
    private static final BitSet UNSAFE_CHARACTERS_HTTP = toBitSet(COMMON_UNSAFE_CHARACTERS);

    private UnsafeUriCharactersEncoder() {
        // util class
    }

    /**
     * Encodes the unsafe characters of the given string using the generic (RFC 1738) set.
     *
     * @param s the string to encode, may be <tt>null</tt>
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty or needs no encoding
     */
    public static String encode(String s) {
        return encode(s, UNSAFE_CHARACTERS_RFC1738);
    }

    /**
     * Encodes the unsafe characters of the given string using the HTTP set
     * (same as the generic set, except that <tt>[</tt> and <tt>]</tt> are not encoded).
     *
     * @param s the string to encode, may be <tt>null</tt>
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty or needs no encoding
     */
    public static String encodeHttpURI(String s) {
        return encode(s, UNSAFE_CHARACTERS_HTTP);
    }

    /**
     * Encodes the given string using a caller supplied set of unsafe characters, without RAW checking.
     *
     * @param s                the string to encode, may be <tt>null</tt>
     * @param unsafeCharacters the characters to encode; only ASCII characters 1..127 are considered
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty or needs no encoding
     */
    public static String encode(String s, BitSet unsafeCharacters) {
        return encode(s, unsafeCharacters, false);
    }

    /**
     * Encodes the unsafe characters of the given string using the generic (RFC 1738) set.
     *
     * @param s        the string to encode, may be <tt>null</tt>
     * @param checkRaw whether <tt>RAW(...)</tt> tokens should be detected
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty or needs no encoding
     * @see #encode(String, BitSet, boolean)
     */
    public static String encode(String s, boolean checkRaw) {
        return encode(s, UNSAFE_CHARACTERS_RFC1738, checkRaw);
    }

    /**
     * Encodes the unsafe characters of the given string using the HTTP set.
     *
     * @param s        the string to encode, may be <tt>null</tt>
     * @param checkRaw whether <tt>RAW(...)</tt> tokens should be detected
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty or needs no encoding
     * @see #encode(String, BitSet, boolean)
     */
    public static String encodeHttpURI(String s, boolean checkRaw) {
        return encode(s, UNSAFE_CHARACTERS_HTTP, checkRaw);
    }

    /**
     * Encodes every unsafe character of the given string as <tt>%XX</tt>.
     * <p>
     * A <tt>%</tt> followed by two hex digits is treated as an already encoded
     * value and copied unchanged. When <tt>checkRaw</tt> is enabled this shortcut
     * is not applied inside a <tt>RAW(...)</tt> token: there the <tt>%</tt> is
     * always escaped to <tt>%25</tt>.
     *
     * @param s                the string to encode, may be <tt>null</tt>
     * @param unsafeCharacters the characters to encode; only ASCII characters 1..127 are considered
     * @param checkRaw         whether <tt>RAW(...)</tt> tokens should be detected
     * @return the encoded string, or <tt>s</tt> itself if it is <tt>null</tt>, empty or needs no encoding
     */
    public static String encode(String s, BitSet unsafeCharacters, boolean checkRaw) {
        // must be checked before anything else, the RAW scan cannot deal with null
        if (s == null || s.isEmpty()) {
            return s;
        }

        // first check whether we actually need to encode
        char[] chars = s.toCharArray();
        if (!containsUnsafe(chars, unsafeCharacters)) {
            return s;
        }

        // only scan for RAW tokens when there is something to encode
        List<Pair> rawPairs;
        if (checkRaw) {
            rawPairs = checkRAW(s);
        } else {
            rawPairs = new ArrayList<>();
        }

        // okay there are some unsafe characters so we do need to encode
        // see details at: http://en.wikipedia.org/wiki/Url_encode
        StringBuilder sb = new StringBuilder(chars.length + 16);
        for (int i = 0; i < chars.length; i++) {
            char ch = chars[i];
            if (!isUnsafe(ch, unsafeCharacters)) {
                sb.append(ch);
            } else if (ch == '%' && isEscapeSequence(chars, i) && !isRaw(i, rawPairs)) {
                // its already encoded (outside of a RAW token) so just append as is
                sb.append(ch);
            } else {
                // must escape then, as its an unsafe character
                appendEscape(sb, ch);
            }
        }
        return sb.toString();
    }

    /** Creates a bit set with one bit set for each character of the given string. */
    private static BitSet toBitSet(String characters) {
        BitSet answer = new BitSet(256);
        for (int i = 0; i < characters.length(); i++) {
            answer.set(characters.charAt(i));
        }
        return answer;
    }

    /** Whether the character is an ASCII character (1..127) flagged in the given set. */
    private static boolean isUnsafe(char ch, BitSet unsafeCharacters) {
        // just deal with the ascii character
        return ch > 0 && ch < 128 && unsafeCharacters.get(ch);
    }

    /** Whether at least one of the characters is unsafe. */
    private static boolean containsUnsafe(char[] chars, BitSet unsafeCharacters) {
        for (char ch : chars) {
            if (isUnsafe(ch, unsafeCharacters)) {
                return true;
            }
        }
        return false;
    }

    /** Whether the two characters following <tt>index</tt> exist and are both hex digits. */
    private static boolean isEscapeSequence(char[] chars, int index) {
        return index + 2 < chars.length && isHexDigit(chars[index + 1]) && isHexDigit(chars[index + 2]);
    }

    /**
     * Finds all <tt>RAW(...)</tt> tokens.
     *
     * @return the position of each token in ascending order, both ends inclusive
     */
    private static List<Pair> checkRAW(String s) {
        Matcher matcher = RAW_PATTERN.matcher(s);
        List<Pair> answer = new ArrayList<>();
        // Check all occurrences
        while (matcher.find()) {
            // end() is the offset after the closing parenthesis, so step back to make the range inclusive
            answer.add(new Pair(matcher.start(), matcher.end() - 1));
        }
        return answer;
    }

    /**
     * Whether the index is inside one of the given ranges.
     *
     * @param index the character index to test
     * @param pairs the ranges in ascending order, both ends inclusive
     */
    private static boolean isRaw(int index, List<Pair> pairs) {
        for (Pair pair : pairs) {
            if (index < pair.left) {
                // ranges are ordered, so no later range can contain the index
                return false;
            }
            if (index <= pair.right) {
                return true;
            }
        }
        return false;
    }

    /** An index range, both ends inclusive. */
    private static final class Pair {
        final int left;
        final int right;

        Pair(int left, int right) {
            this.left = left;
            this.right = right;
        }
    }

    /** Appends the <tt>%XX</tt> escape of the given ASCII character. */
    private static void appendEscape(StringBuilder sb, char ch) {
        sb.append('%');
        sb.append(HEX_DIGITS[(ch >> 4) & 0x0f]);
        sb.append(HEX_DIGITS[ch & 0x0f]);
    }

    /** Whether the character is one of <tt>0-9</tt>, <tt>A-F</tt> or <tt>a-f</tt>. */
    private static boolean isHexDigit(char ch) {
        return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
    }

}