// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2016-present Datadog, Inc.

  5. package obfuscate
  6. import (
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "strings"
  11. "unicode"
  12. "unicode/utf8"
  13. )
  14. var questionMark = []byte("?")
  15. // discardFilter is a token filter which discards certain elements from a query, such as
  16. // comments and AS aliases by returning a nil buffer.
  17. type discardFilter struct{ keepSQLAlias bool }
  18. // Filter the given token so that a `nil` slice is returned if the token is in the token filtered list.
  19. func (f *discardFilter) Filter(token, lastToken TokenKind, buffer []byte) (TokenKind, []byte, error) {
  20. // filters based on previous token
  21. switch lastToken {
  22. case FilteredBracketedIdentifier:
  23. if token != ']' {
  24. // we haven't found the closing bracket yet, keep going
  25. if token != ID {
  26. // the token between the brackets *must* be an identifier,
  27. // otherwise the query is invalid.
  28. return LexError, nil, fmt.Errorf("expected identifier in bracketed filter, got %d", token)
  29. }
  30. return FilteredBracketedIdentifier, nil, nil
  31. }
  32. fallthrough
  33. case As:
  34. if token == '[' {
  35. // the identifier followed by AS is an MSSQL bracketed identifier
  36. // and will continue to be discarded until we find the corresponding
  37. // closing bracket counter-part. See GitHub issue DataDog/datadog-trace-agent#475.
  38. return FilteredBracketedIdentifier, nil, nil
  39. }
  40. if f.keepSQLAlias {
  41. return token, buffer, nil
  42. }
  43. return Filtered, nil, nil
  44. }
  45. // filters based on the current token; if the next token should be ignored,
  46. // return the same token value (not FilteredGroupable) and nil
  47. switch token {
  48. case Comment:
  49. return Filtered, nil, nil
  50. case ';':
  51. return markFilteredGroupable(token), nil, nil
  52. case As:
  53. if !f.keepSQLAlias {
  54. return As, nil, nil
  55. }
  56. fallthrough
  57. default:
  58. return token, buffer, nil
  59. }
  60. }
  61. // Reset implements tokenFilter.
  62. func (f *discardFilter) Reset() {}
  63. // replaceFilter is a token filter which obfuscates strings and numbers in queries by replacing them
  64. // with the "?" character.
  65. type replaceFilter struct {
  66. replaceDigits bool
  67. }
  68. // Filter the given token so that it will be replaced if in the token replacement list
  69. func (f *replaceFilter) Filter(token, lastToken TokenKind, buffer []byte) (tokenType TokenKind, tokenBytes []byte, err error) {
  70. switch lastToken {
  71. case Savepoint:
  72. return markFilteredGroupable(token), questionMark, nil
  73. case '=':
  74. switch token {
  75. case DoubleQuotedString:
  76. // double-quoted strings after assignments are eligible for obfuscation
  77. return markFilteredGroupable(token), questionMark, nil
  78. }
  79. }
  80. switch token {
  81. case DollarQuotedString, String, Number, Null, Variable, PreparedStatement, BooleanLiteral, EscapeSequence:
  82. return markFilteredGroupable(token), questionMark, nil
  83. case '?':
  84. // Cases like 'ARRAY [ ?, ? ]' should be collapsed into 'ARRAY [ ? ]'
  85. return markFilteredGroupable(token), questionMark, nil
  86. case TableName, ID:
  87. if f.replaceDigits {
  88. return token, replaceDigits(buffer), nil
  89. }
  90. fallthrough
  91. default:
  92. return token, buffer, nil
  93. }
  94. }
  95. // Reset implements tokenFilter.
  96. func (f *replaceFilter) Reset() {}
  97. // groupingFilter is a token filter which groups together items replaced by the replaceFilter. It is meant
  98. // to run immediately after it.
  99. type groupingFilter struct {
  100. groupFilter int // counts the number of values, e.g. 3 = ?, ?, ?
  101. groupMulti int // counts the number of groups, e.g. 2 = (?, ?), (?, ?, ?)
  102. }
  103. // Filter the given token so that it will be discarded if a grouping pattern
  104. // has been recognized. A grouping is composed by items like:
  105. // * '( ?, ?, ? )'
  106. // * '( ?, ? ), ( ?, ? )'
  107. func (f *groupingFilter) Filter(token, lastToken TokenKind, buffer []byte) (tokenType TokenKind, tokenBytes []byte, err error) {
  108. // increasing the number of groups means that we're filtering an entire group
  109. // because it can be represented with a single '( ? )'
  110. if (lastToken == '(' && isFilteredGroupable(token)) || (token == '(' && f.groupMulti > 0) {
  111. f.groupMulti++
  112. }
  113. switch {
  114. case f.groupMulti > 0 && lastToken == FilteredGroupableParenthesis && token == ID:
  115. // this is the start of a new group that seems to be a nested query;
  116. // cancel grouping.
  117. f.Reset()
  118. return token, append([]byte("( "), buffer...), nil
  119. case isFilteredGroupable(token):
  120. // the previous filter has dropped this token so we should start
  121. // counting the group filter so that we accept only one '?' for
  122. // the same group
  123. f.groupFilter++
  124. if f.groupFilter > 1 {
  125. return markFilteredGroupable(token), nil, nil
  126. }
  127. case f.groupFilter > 0 && (token == ',' || token == '?'):
  128. // if we are in a group drop all commas
  129. return markFilteredGroupable(token), nil, nil
  130. case f.groupMulti > 1:
  131. // drop all tokens since we're in a counting group
  132. // and they're duplicated
  133. return markFilteredGroupable(token), nil, nil
  134. case token != ',' && token != '(' && token != ')' && !isFilteredGroupable(token):
  135. // when we're out of a group reset the filter state
  136. f.Reset()
  137. }
  138. return token, buffer, nil
  139. }
  140. // isFilteredGroupable reports whether token is to be considered filtered groupable.
  141. func isFilteredGroupable(token TokenKind) bool {
  142. switch token {
  143. case FilteredGroupable, FilteredGroupableParenthesis:
  144. return true
  145. default:
  146. return false
  147. }
  148. }
  149. // markFilteredGroupable returns the appropriate TokenKind to mark this token as
  150. // filtered groupable.
  151. func markFilteredGroupable(token TokenKind) TokenKind {
  152. switch token {
  153. case '(':
  154. return FilteredGroupableParenthesis
  155. default:
  156. return FilteredGroupable
  157. }
  158. }
  159. // Reset resets the groupingFilter so that it may be used again.
  160. func (f *groupingFilter) Reset() {
  161. f.groupFilter = 0
  162. f.groupMulti = 0
  163. }
  164. // ObfuscateSQLString quantizes and obfuscates the given input SQL query string. Quantization removes
  165. // some elements such as comments and aliases and obfuscation attempts to hide sensitive information
  166. // in strings and numbers by redacting them.
  167. func (o *Obfuscator) ObfuscateSQLString(in string) (*ObfuscatedQuery, error) {
  168. return o.ObfuscateSQLStringWithOptions(in, &o.opts.SQL)
  169. }
  170. // ObfuscateSQLStringWithOptions accepts an optional SQLOptions to change the behavior of the obfuscator
  171. // to quantize and obfuscate the given input SQL query string. Quantization removes some elements such as comments
  172. // and aliases and obfuscation attempts to hide sensitive information in strings and numbers by redacting them.
  173. func (o *Obfuscator) ObfuscateSQLStringWithOptions(in string, opts *SQLConfig) (*ObfuscatedQuery, error) {
  174. if v, ok := o.queryCache.Get(in); ok {
  175. return v.(*ObfuscatedQuery), nil
  176. }
  177. oq, err := o.obfuscateSQLString(in, opts)
  178. if err != nil {
  179. return oq, err
  180. }
  181. o.queryCache.Set(in, oq, oq.Cost())
  182. return oq, nil
  183. }
  184. func (o *Obfuscator) obfuscateSQLString(in string, opts *SQLConfig) (*ObfuscatedQuery, error) {
  185. lesc := o.useSQLLiteralEscapes()
  186. tok := NewSQLTokenizer(in, lesc, opts)
  187. out, err := attemptObfuscation(tok)
  188. if err != nil && tok.SeenEscape() {
  189. // If the tokenizer failed, but saw an escape character in the process,
  190. // try again treating escapes differently
  191. tok = NewSQLTokenizer(in, !lesc, opts)
  192. if out, err2 := attemptObfuscation(tok); err2 == nil {
  193. // If the second attempt succeeded, change the default behavior so that
  194. // on the next run we get it right in the first run.
  195. o.setSQLLiteralEscapes(!lesc)
  196. return out, nil
  197. }
  198. }
  199. return out, err
  200. }
  201. // tableFinderFilter is a filter which attempts to identify the table name as it goes through each
  202. // token in a query.
  203. type tableFinderFilter struct {
  204. storeTableNames bool
  205. // seen keeps track of unique table names encountered by the filter.
  206. seen map[string]struct{}
  207. // csv specifies a comma-separated list of tables
  208. csv strings.Builder
  209. }
  210. // Filter implements tokenFilter.
  211. func (f *tableFinderFilter) Filter(token, lastToken TokenKind, buffer []byte) (TokenKind, []byte, error) {
  212. switch lastToken {
  213. case From, Join:
  214. // SELECT ... FROM [tableName]
  215. // DELETE FROM [tableName]
  216. // ... JOIN [tableName]
  217. if r, _ := utf8.DecodeRune(buffer); !unicode.IsLetter(r) {
  218. // first character in buffer is not a letter; we might have a nested
  219. // query like SELECT * FROM (SELECT ...)
  220. break
  221. }
  222. fallthrough
  223. case Update, Into:
  224. // UPDATE [tableName]
  225. // INSERT INTO [tableName]
  226. if f.storeTableNames {
  227. f.storeName(string(buffer))
  228. }
  229. return TableName, buffer, nil
  230. }
  231. return token, buffer, nil
  232. }
  233. // storeName marks the given table name as seen in the internal storage.
  234. func (f *tableFinderFilter) storeName(name string) {
  235. if _, ok := f.seen[name]; ok {
  236. return
  237. }
  238. if f.seen == nil {
  239. f.seen = make(map[string]struct{}, 1)
  240. }
  241. f.seen[name] = struct{}{}
  242. if f.csv.Len() > 0 {
  243. f.csv.WriteByte(',')
  244. }
  245. f.csv.WriteString(name)
  246. }
  247. // CSV returns a comma-separated list of the tables seen by the filter.
  248. func (f *tableFinderFilter) CSV() string { return f.csv.String() }
  249. // Reset implements tokenFilter.
  250. func (f *tableFinderFilter) Reset() {
  251. for k := range f.seen {
  252. delete(f.seen, k)
  253. }
  254. f.csv.Reset()
  255. }
  256. // ObfuscatedQuery specifies information about an obfuscated SQL query.
  257. type ObfuscatedQuery struct {
  258. Query string // the obfuscated SQL query
  259. TablesCSV string // comma-separated list of tables that the query addresses
  260. }
  261. // Cost returns the number of bytes needed to store all the fields
  262. // of this ObfuscatedQuery.
  263. func (oq *ObfuscatedQuery) Cost() int64 {
  264. return int64(len(oq.Query) + len(oq.TablesCSV))
  265. }
  266. // attemptObfuscation attempts to obfuscate the SQL query loaded into the tokenizer, using the given set of filters.
  267. func attemptObfuscation(tokenizer *SQLTokenizer) (*ObfuscatedQuery, error) {
  268. var (
  269. storeTableNames = tokenizer.cfg.TableNames
  270. out = bytes.NewBuffer(make([]byte, 0, len(tokenizer.buf)))
  271. err error
  272. lastToken TokenKind
  273. discard = discardFilter{tokenizer.cfg.KeepSQLAlias}
  274. replace = replaceFilter{replaceDigits: tokenizer.cfg.ReplaceDigits}
  275. grouping groupingFilter
  276. tableFinder = tableFinderFilter{storeTableNames: storeTableNames}
  277. )
  278. // call Scan() function until tokens are available or if a LEX_ERROR is raised. After
  279. // retrieving a token, send it to the tokenFilter chains so that the token is discarded
  280. // or replaced.
  281. for {
  282. token, buff := tokenizer.Scan()
  283. if token == EndChar {
  284. break
  285. }
  286. if token == LexError {
  287. return nil, fmt.Errorf("%v", tokenizer.Err())
  288. }
  289. if token, buff, err = discard.Filter(token, lastToken, buff); err != nil {
  290. return nil, err
  291. }
  292. if storeTableNames {
  293. if token, buff, err = tableFinder.Filter(token, lastToken, buff); err != nil {
  294. return nil, err
  295. }
  296. }
  297. if token, buff, err = replace.Filter(token, lastToken, buff); err != nil {
  298. return nil, err
  299. }
  300. if token, buff, err = grouping.Filter(token, lastToken, buff); err != nil {
  301. return nil, err
  302. }
  303. if buff != nil {
  304. if out.Len() != 0 {
  305. switch token {
  306. case ',':
  307. case '=':
  308. if lastToken == ':' {
  309. // do not add a space before an equals if a colon was
  310. // present before it.
  311. break
  312. }
  313. fallthrough
  314. default:
  315. out.WriteRune(' ')
  316. }
  317. }
  318. out.Write(buff)
  319. }
  320. lastToken = token
  321. }
  322. if out.Len() == 0 {
  323. return nil, errors.New("result is empty")
  324. }
  325. return &ObfuscatedQuery{
  326. Query: out.String(),
  327. TablesCSV: tableFinder.CSV(),
  328. }, nil
  329. }
  330. // ObfuscateSQLExecPlan obfuscates query conditions in the provided JSON encoded execution plan. If normalize=True,
  331. // then cost and row estimates are also obfuscated away.
  332. func (o *Obfuscator) ObfuscateSQLExecPlan(jsonPlan string, normalize bool) (string, error) {
  333. if normalize {
  334. return o.sqlExecPlanNormalize.obfuscate([]byte(jsonPlan))
  335. }
  336. return o.sqlExecPlan.obfuscate([]byte(jsonPlan))
  337. }