001/** 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.camel.util; 018 019import java.util.ArrayList; 020import java.util.BitSet; 021import java.util.List; 022import java.util.regex.Matcher; 023import java.util.regex.Pattern; 024 025/** 026 * Encoder for unsafe URI characters. 027 * <p/> 028 * A good source for details is <a href="http://en.wikipedia.org/wiki/Url_encode">wikipedia url encode</a> article. 029 */ 030public final class UnsafeUriCharactersEncoder { 031 private static BitSet unsafeCharactersRfc1738; 032 private static BitSet unsafeCharactersHttp; 033 private static final char[] HEX_DIGITS = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 034 'D', 'E', 'F', 'a', 'b', 'c', 'd', 'e', 'f'}; 035 private static final Pattern RAW_PATTERN = Pattern.compile("RAW\\([^\\)]+\\)"); 036 037 static { 038 unsafeCharactersRfc1738 = new BitSet(256); 039 unsafeCharactersRfc1738.set(' '); 040 unsafeCharactersRfc1738.set('"'); 041 unsafeCharactersRfc1738.set('<'); 042 unsafeCharactersRfc1738.set('>'); 043 unsafeCharactersRfc1738.set('#'); 044 unsafeCharactersRfc1738.set('%'); 045 unsafeCharactersRfc1738.set('{'); 046 unsafeCharactersRfc1738.set('}'); 047 unsafeCharactersRfc1738.set('|'); 048 unsafeCharactersRfc1738.set('\\'); 049 unsafeCharactersRfc1738.set('^'); 050 unsafeCharactersRfc1738.set('~'); 051 unsafeCharactersRfc1738.set('['); 052 unsafeCharactersRfc1738.set(']'); 053 unsafeCharactersRfc1738.set('`'); 054 } 055 056 static { 057 unsafeCharactersHttp = new BitSet(256); 058 unsafeCharactersHttp.set(' '); 059 unsafeCharactersHttp.set('"'); 060 unsafeCharactersHttp.set('<'); 061 unsafeCharactersHttp.set('>'); 062 unsafeCharactersHttp.set('#'); 063 unsafeCharactersHttp.set('%'); 064 unsafeCharactersHttp.set('{'); 065 unsafeCharactersHttp.set('}'); 066 unsafeCharactersHttp.set('|'); 067 unsafeCharactersHttp.set('\\'); 068 unsafeCharactersHttp.set('^'); 069 unsafeCharactersHttp.set('~'); 070 unsafeCharactersHttp.set('`'); 071 } 072 073 private UnsafeUriCharactersEncoder() { 074 // util class 075 } 076 077 public static String encode(String s) { 078 return encode(s, unsafeCharactersRfc1738); 079 } 080 081 public static String encodeHttpURI(String s) { 082 return encode(s, unsafeCharactersHttp); 083 } 084 085 public static String encode(String s, BitSet unsafeCharacters) { 086 return encode(s, unsafeCharacters, false); 087 } 088 089 public static String encode(String s, boolean checkRaw) { 090 return encode(s, unsafeCharactersRfc1738, checkRaw); 091 } 092 093 public static String encodeHttpURI(String s, boolean checkRaw) { 094 return encode(s, unsafeCharactersHttp, checkRaw); 095 } 096 097 private static List<Pair> checkRAW(String s) { 098 Matcher matcher = RAW_PATTERN.matcher(s); 099 List<Pair> answer = new ArrayList<>(); 100 // Check all occurrences 101 while (matcher.find()) { 102 // TODO: should likely be matcher.end() - 1 103 answer.add(new Pair(matcher.start(), matcher.end())); 104 } 105 return answer; 106 } 107 108 private static boolean isRaw(int index, List<Pair>pairs) { 109 for (Pair pair : pairs) { 110 if (index < pair.left) { 111 return false; 112 } else { 113 if (index >= pair.left) { 114 if (index <= pair.right) { 115 return true; 116 } else { 117 continue; 118 } 119 } 120 } 121 } 122 return false; 123 } 124 125 private static class Pair { 126 int left; 127 int right; 128 Pair(int left, int right) { 129 this.left = left; 130 this.right = right; 131 } 132 } 133 134 // Just skip the encode for isRAW part 135 public static String encode(String s, BitSet unsafeCharacters, boolean checkRaw) { 136 List<Pair> rawPairs; 137 if (checkRaw) { 138 rawPairs = checkRAW(s); 139 } else { 140 rawPairs = new ArrayList<>(); 141 } 142 143 int n = s == null ? 0 : s.length(); 144 if (n == 0) { 145 return s; 146 } 147 148 // First check whether we actually need to encode 149 char chars[] = s.toCharArray(); 150 for (int i = 0;;) { 151 // just deal with the ascii character 152 if (chars[i] > 0 && chars[i] < 128) { 153 if (unsafeCharacters.get(chars[i])) { 154 break; 155 } 156 } 157 if (++i >= chars.length) { 158 return s; 159 } 160 } 161 162 // okay there are some unsafe characters so we do need to encode 163 // see details at: http://en.wikipedia.org/wiki/Url_encode 164 StringBuilder sb = new StringBuilder(); 165 for (int i = 0; i < chars.length; i++) { 166 char ch = chars[i]; 167 if (ch > 0 && ch < 128 && unsafeCharacters.get(ch)) { 168 // special for % sign as it may be a decimal encoded value 169 if (ch == '%') { 170 char next = i + 1 < chars.length ? chars[i + 1] : ' '; 171 char next2 = i + 2 < chars.length ? chars[i + 2] : ' '; 172 173 if (isHexDigit(next) && isHexDigit(next2) && !isRaw(i, rawPairs)) { 174 // its already encoded (decimal encoded) so just append as is 175 sb.append(ch); 176 } else { 177 // must escape then, as its an unsafe character 178 appendEscape(sb, (byte)ch); 179 } 180 } else { 181 // must escape then, as its an unsafe character 182 appendEscape(sb, (byte)ch); 183 } 184 } else { 185 sb.append(ch); 186 } 187 } 188 return sb.toString(); 189 } 190 191 private static void appendEscape(StringBuilder sb, byte b) { 192 sb.append('%'); 193 sb.append(HEX_DIGITS[(b >> 4) & 0x0f]); 194 sb.append(HEX_DIGITS[(b >> 0) & 0x0f]); 195 } 196 197 private static boolean isHexDigit(char ch) { 198 for (char hex : HEX_DIGITS) { 199 if (hex == ch) { 200 return true; 201 } 202 } 203 return false; 204 } 205 206}