sql_tokenizer.go 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767
  1. // Unless explicitly stated otherwise all files in this repository are licensed
  2. // under the Apache License Version 2.0.
  3. // This product includes software developed at Datadog (https://www.datadoghq.com/).
  4. // Copyright 2016-present Datadog, Inc.
  5. package obfuscate
  6. import (
  7. "bytes"
  8. "fmt"
  9. "unicode"
  10. "unicode/utf8"
  11. )
  12. // tokenizer.go implements a lexer-like iterator that tokenizes SQL and CQL
  13. // strings, so that an external component can filter or alter each token of the
  14. // string. This implementation can't be used as a real SQL lexer (so a parser
  15. // cannot build the AST) because many rules are ignored to make the tokenizer
  16. // simpler.
  17. // This implementation was inspired by https://github.com/youtube/vitess sql parser
  18. // TODO: add the license to the NOTICE file
  19. // TokenKind specifies the type of the token being scanned. It may be one of the defined
  20. // constants below or in some cases the actual rune itself.
  21. type TokenKind uint32
  22. // EndChar is used to signal that the scanner has finished reading the query. This happens when
  23. // there are no more characters left in the query or when invalid encoding is discovered. EndChar
  24. // is an invalid rune value that can not be found in any valid string.
  25. const EndChar = unicode.MaxRune + 1
  26. // list of available tokens; this list has been reduced because we don't
  27. // need a full-fledged tokenizer to implement a Lexer
  28. const (
  29. LexError = TokenKind(57346) + iota
  30. ID
  31. Limit
  32. Null
  33. String
  34. DoubleQuotedString
  35. DollarQuotedString // https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
  36. DollarQuotedFunc // a dollar-quoted string delimited by the tag "$func$"; gets special treatment when feature "dollar_quoted_func" is set
  37. Number
  38. BooleanLiteral
  39. ValueArg
  40. ListArg
  41. Comment
  42. Variable
  43. Savepoint
  44. PreparedStatement
  45. EscapeSequence
  46. NullSafeEqual
  47. LE
  48. GE
  49. NE
  50. Not
  51. As
  52. From
  53. Update
  54. Insert
  55. Into
  56. Join
  57. TableName
  58. ColonCast
  59. // FilteredGroupable specifies that the given token has been discarded by one of the
  60. // token filters and that it is groupable together with consecutive FilteredGroupable
  61. // tokens.
  62. FilteredGroupable
  63. // FilteredGroupableParenthesis is a parenthesis marked as filtered groupable. It is the
  64. // beginning of either a group of values ('(') or a nested query. We track is as
  65. // a special case for when it may start a nested query as opposed to just another
  66. // value group to be obfuscated.
  67. FilteredGroupableParenthesis
  68. // Filtered specifies that the token is a comma and was discarded by one
  69. // of the filters.
  70. Filtered
  71. // FilteredBracketedIdentifier specifies that we are currently discarding
  72. // a bracketed identifier (MSSQL).
  73. // See issue https://github.com/DataDog/datadog-trace-agent/issues/475.
  74. FilteredBracketedIdentifier
  75. )
  76. var tokenKindStrings = map[TokenKind]string{
  77. LexError: "LexError",
  78. ID: "ID",
  79. Limit: "Limit",
  80. Null: "Null",
  81. String: "String",
  82. DoubleQuotedString: "DoubleQuotedString",
  83. DollarQuotedString: "DollarQuotedString",
  84. DollarQuotedFunc: "DollarQuotedFunc",
  85. Number: "Number",
  86. BooleanLiteral: "BooleanLiteral",
  87. ValueArg: "ValueArg",
  88. ListArg: "ListArg",
  89. Comment: "Comment",
  90. Variable: "Variable",
  91. Savepoint: "Savepoint",
  92. PreparedStatement: "PreparedStatement",
  93. EscapeSequence: "EscapeSequence",
  94. NullSafeEqual: "NullSafeEqual",
  95. LE: "LE",
  96. GE: "GE",
  97. NE: "NE",
  98. Not: "NOT",
  99. As: "As",
  100. From: "From",
  101. Update: "Update",
  102. Insert: "Insert",
  103. Into: "Into",
  104. Join: "Join",
  105. TableName: "TableName",
  106. ColonCast: "ColonCast",
  107. FilteredGroupable: "FilteredGroupable",
  108. FilteredGroupableParenthesis: "FilteredGroupableParenthesis",
  109. Filtered: "Filtered",
  110. FilteredBracketedIdentifier: "FilteredBracketedIdentifier",
  111. }
  112. func (k TokenKind) String() string {
  113. str, ok := tokenKindStrings[k]
  114. if !ok {
  115. return "<unknown>"
  116. }
  117. return str
  118. }
  119. const escapeCharacter = '\\'
  120. // SQLTokenizer is the struct used to generate SQL
  121. // tokens for the parser.
  122. type SQLTokenizer struct {
  123. pos int // byte offset of lastChar
  124. lastChar rune // last read rune
  125. buf []byte // buf holds the query that we are parsing
  126. off int // off is the index into buf where the unread portion of the query begins.
  127. err error // any error occurred while reading
  128. curlys uint32 // number of active open curly braces in top-level SQL escape sequences.
  129. literalEscapes bool // indicates we should not treat backslashes as escape characters
  130. seenEscape bool // indicates whether this tokenizer has seen an escape character within a string
  131. cfg *SQLConfig
  132. }
  133. // NewSQLTokenizer creates a new SQLTokenizer for the given SQL string. The literalEscapes argument specifies
  134. // whether escape characters should be treated literally or as such.
  135. func NewSQLTokenizer(sql string, literalEscapes bool, cfg *SQLConfig) *SQLTokenizer {
  136. if cfg == nil {
  137. cfg = new(SQLConfig)
  138. }
  139. return &SQLTokenizer{
  140. buf: []byte(sql),
  141. cfg: cfg,
  142. literalEscapes: literalEscapes,
  143. }
  144. }
  145. // Reset the underlying buffer and positions
  146. func (tkn *SQLTokenizer) Reset(in string) {
  147. tkn.pos = 0
  148. tkn.lastChar = 0
  149. tkn.buf = []byte(in)
  150. tkn.off = 0
  151. tkn.err = nil
  152. }
  153. // keywords used to recognize string tokens
  154. var keywords = map[string]TokenKind{
  155. "NULL": Null,
  156. "TRUE": BooleanLiteral,
  157. "FALSE": BooleanLiteral,
  158. "SAVEPOINT": Savepoint,
  159. "LIMIT": Limit,
  160. "AS": As,
  161. "FROM": From,
  162. "UPDATE": Update,
  163. "INSERT": Insert,
  164. "INTO": Into,
  165. "JOIN": Join,
  166. }
  167. // Err returns the last error that the tokenizer encountered, or nil.
  168. func (tkn *SQLTokenizer) Err() error { return tkn.err }
  169. func (tkn *SQLTokenizer) setErr(format string, args ...interface{}) {
  170. if tkn.err != nil {
  171. return
  172. }
  173. tkn.err = fmt.Errorf("at position %d: %v", tkn.pos, fmt.Errorf(format, args...))
  174. }
  175. // SeenEscape returns whether or not this tokenizer has seen an escape character within a scanned string
  176. func (tkn *SQLTokenizer) SeenEscape() bool { return tkn.seenEscape }
  177. // Scan scans the tokenizer for the next token and returns
  178. // the token type and the token buffer.
  179. func (tkn *SQLTokenizer) Scan() (TokenKind, []byte) {
  180. if tkn.lastChar == 0 {
  181. tkn.advance()
  182. }
  183. tkn.skipBlank()
  184. switch ch := tkn.lastChar; {
  185. case isLeadingLetter(ch):
  186. return tkn.scanIdentifier()
  187. case isDigit(ch):
  188. return tkn.scanNumber(false)
  189. default:
  190. tkn.advance()
  191. if tkn.lastChar == EndChar && tkn.err != nil {
  192. // advance discovered an invalid encoding. We should return early.
  193. return LexError, nil
  194. }
  195. switch ch {
  196. case EndChar:
  197. if tkn.err != nil {
  198. return LexError, nil
  199. }
  200. return EndChar, nil
  201. case ':':
  202. if tkn.lastChar == ':' {
  203. tkn.advance()
  204. return ColonCast, []byte("::")
  205. }
  206. if unicode.IsSpace(tkn.lastChar) {
  207. // example scenario: "autovacuum: VACUUM ANALYZE fake.table"
  208. return TokenKind(ch), tkn.bytes()
  209. }
  210. if tkn.lastChar != '=' {
  211. return tkn.scanBindVar()
  212. }
  213. fallthrough
  214. case '~':
  215. switch tkn.lastChar {
  216. case '*':
  217. tkn.advance()
  218. return TokenKind('~'), []byte("~*")
  219. default:
  220. return TokenKind(ch), tkn.bytes()
  221. }
  222. case '=', ',', ';', '(', ')', '+', '*', '&', '|', '^', '[', ']', '?':
  223. return TokenKind(ch), tkn.bytes()
  224. case '.':
  225. if isDigit(tkn.lastChar) {
  226. return tkn.scanNumber(true)
  227. }
  228. return TokenKind(ch), tkn.bytes()
  229. case '/':
  230. switch tkn.lastChar {
  231. case '/':
  232. tkn.advance()
  233. return tkn.scanCommentType1("//")
  234. case '*':
  235. tkn.advance()
  236. return tkn.scanCommentType2()
  237. default:
  238. return TokenKind(ch), tkn.bytes()
  239. }
  240. case '-':
  241. switch {
  242. case tkn.lastChar == '-':
  243. tkn.advance()
  244. return tkn.scanCommentType1("--")
  245. case isDigit(tkn.lastChar):
  246. tkn.advance()
  247. kind, tokenBytes := tkn.scanNumber(false)
  248. return kind, append([]byte{'-'}, tokenBytes...)
  249. default:
  250. return TokenKind(ch), tkn.bytes()
  251. }
  252. case '#':
  253. tkn.advance()
  254. return tkn.scanCommentType1("#")
  255. case '<':
  256. switch tkn.lastChar {
  257. case '>':
  258. tkn.advance()
  259. return NE, []byte("<>")
  260. case '=':
  261. tkn.advance()
  262. switch tkn.lastChar {
  263. case '>':
  264. tkn.advance()
  265. return NullSafeEqual, []byte("<=>")
  266. default:
  267. return LE, []byte("<=")
  268. }
  269. default:
  270. return TokenKind(ch), tkn.bytes()
  271. }
  272. case '>':
  273. if tkn.lastChar == '=' {
  274. tkn.advance()
  275. return GE, []byte(">=")
  276. }
  277. return TokenKind(ch), tkn.bytes()
  278. case '!':
  279. switch tkn.lastChar {
  280. case '=':
  281. tkn.advance()
  282. return NE, []byte("!=")
  283. case '~':
  284. tkn.advance()
  285. switch tkn.lastChar {
  286. case '*':
  287. tkn.advance()
  288. return NE, []byte("!~*")
  289. default:
  290. return NE, []byte("!~")
  291. }
  292. default:
  293. if isValidCharAfterOperator(tkn.lastChar) {
  294. return Not, tkn.bytes()
  295. }
  296. tkn.setErr(`unexpected char "%c" (%d) after "!"`, tkn.lastChar, tkn.lastChar)
  297. return LexError, tkn.bytes()
  298. }
  299. case '\'':
  300. return tkn.scanString(ch, String)
  301. case '"':
  302. return tkn.scanString(ch, DoubleQuotedString)
  303. case '`':
  304. return tkn.scanString(ch, ID)
  305. case '%':
  306. if tkn.lastChar == '(' {
  307. return tkn.scanVariableIdentifier('%')
  308. }
  309. if isLetter(tkn.lastChar) {
  310. // format parameter (e.g. '%s')
  311. return tkn.scanFormatParameter('%')
  312. }
  313. // modulo operator (e.g. 'id % 8')
  314. return TokenKind(ch), tkn.bytes()
  315. case '$':
  316. if isDigit(tkn.lastChar) {
  317. // TODO(gbbr): the first digit after $ does not necessarily guarantee
  318. // that this isn't a dollar-quoted string constant. We might eventually
  319. // want to cover for this use-case too (e.g. $1$some text$1$).
  320. return tkn.scanPreparedStatement('$')
  321. }
  322. kind, tok := tkn.scanDollarQuotedString()
  323. if kind == DollarQuotedFunc {
  324. // this is considered an embedded query, we should try and
  325. // obfuscate it
  326. out, err := attemptObfuscation(NewSQLTokenizer(string(tok), tkn.literalEscapes, tkn.cfg))
  327. if err != nil {
  328. // if we can't obfuscate it, treat it as a regular string
  329. return DollarQuotedString, tok
  330. }
  331. tok = append(append([]byte("$func$"), []byte(out.Query)...), []byte("$func$")...)
  332. }
  333. return kind, tok
  334. case '{':
  335. if tkn.pos == 1 || tkn.curlys > 0 {
  336. // Do not fully obfuscate top-level SQL escape sequences like {{[?=]call procedure-name[([parameter][,parameter]...)]}.
  337. // We want these to display a bit more context than just a plain '?'
  338. // See: https://docs.oracle.com/cd/E13157_01/wlevs/docs30/jdbc_drivers/sqlescape.html
  339. tkn.curlys++
  340. return TokenKind(ch), tkn.bytes()
  341. }
  342. return tkn.scanEscapeSequence('{')
  343. case '}':
  344. if tkn.curlys == 0 {
  345. // A closing curly brace has no place outside an in-progress top-level SQL escape sequence
  346. // started by the '{' switch-case.
  347. tkn.setErr(`unexpected byte %d`, ch)
  348. return LexError, tkn.bytes()
  349. }
  350. tkn.curlys--
  351. return TokenKind(ch), tkn.bytes()
  352. default:
  353. tkn.setErr(`unexpected byte %d`, ch)
  354. return LexError, tkn.bytes()
  355. }
  356. }
  357. }
  358. func (tkn *SQLTokenizer) skipBlank() {
  359. for unicode.IsSpace(tkn.lastChar) {
  360. tkn.advance()
  361. }
  362. tkn.bytes()
  363. }
  364. // toUpper is a modified version of bytes.ToUpper. It returns an upper-cased version of the byte
  365. // slice src with all Unicode letters mapped to their upper case. It is modified to also accept a
  366. // byte slice dst as an argument, the underlying storage of which (up to the capacity of dst)
  367. // will be used as the destination of the upper-case copy of src, if it fits. As a special case,
  368. // toUpper will return src if the byte slice is already upper-case. This function is used rather
  369. // than bytes.ToUpper to improve the memory performance of the obfuscator by saving unnecessary
  370. // allocations happening in bytes.ToUpper
  371. func toUpper(src, dst []byte) []byte {
  372. dst = dst[:0]
  373. isASCII, hasLower := true, false
  374. for i := 0; i < len(src); i++ {
  375. c := src[i]
  376. if c >= utf8.RuneSelf {
  377. isASCII = false
  378. break
  379. }
  380. hasLower = hasLower || ('a' <= c && c <= 'z')
  381. }
  382. if cap(dst) < len(src) {
  383. dst = make([]byte, 0, len(src))
  384. }
  385. if isASCII { // optimize for ASCII-only byte slices.
  386. if !hasLower {
  387. // Just return src.
  388. return src
  389. }
  390. dst = dst[:len(src)]
  391. for i := 0; i < len(src); i++ {
  392. c := src[i]
  393. if 'a' <= c && c <= 'z' {
  394. c -= 'a' - 'A'
  395. }
  396. dst[i] = c
  397. }
  398. return dst
  399. }
  400. // This *could* be optimized, but it's an uncommon case.
  401. return bytes.Map(unicode.ToUpper, src)
  402. }
  403. func (tkn *SQLTokenizer) scanIdentifier() (TokenKind, []byte) {
  404. tkn.advance()
  405. for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' || tkn.lastChar == '*' {
  406. tkn.advance()
  407. }
  408. t := tkn.bytes()
  409. // Space allows us to upper-case identifiers 256 bytes long or less without allocating heap
  410. // storage for them, since space is allocated on the stack. A size of 256 bytes was chosen
  411. // based on the allowed length of sql identifiers in various sql implementations.
  412. var space [256]byte
  413. upper := toUpper(t, space[:0])
  414. if keywordID, found := keywords[string(upper)]; found {
  415. return keywordID, t
  416. }
  417. return ID, t
  418. }
  419. func (tkn *SQLTokenizer) scanVariableIdentifier(prefix rune) (TokenKind, []byte) {
  420. for tkn.advance(); tkn.lastChar != ')' && tkn.lastChar != EndChar; tkn.advance() {
  421. }
  422. tkn.advance()
  423. if !isLetter(tkn.lastChar) {
  424. tkn.setErr(`invalid character after variable identifier: "%c" (%d)`, tkn.lastChar, tkn.lastChar)
  425. return LexError, tkn.bytes()
  426. }
  427. tkn.advance()
  428. return Variable, tkn.bytes()
  429. }
  430. func (tkn *SQLTokenizer) scanFormatParameter(prefix rune) (TokenKind, []byte) {
  431. tkn.advance()
  432. return Variable, tkn.bytes()
  433. }
  434. // scanDollarQuotedString scans a Postgres dollar-quoted string constant.
  435. // See: https://www.postgresql.org/docs/current/sql-syntax-lexical.html#SQL-SYNTAX-DOLLAR-QUOTING
  436. func (tkn *SQLTokenizer) scanDollarQuotedString() (TokenKind, []byte) {
  437. kind, tag := tkn.scanString('$', String)
  438. if kind == LexError {
  439. return kind, tkn.bytes()
  440. }
  441. var (
  442. got int
  443. buf bytes.Buffer
  444. )
  445. delim := tag
  446. // on empty strings, tkn.scanString returns the delimiters
  447. if string(delim) != "$$" {
  448. // on non-empty strings, the delimiter is $tag$
  449. delim = append([]byte{'$'}, delim...)
  450. delim = append(delim, '$')
  451. }
  452. for {
  453. ch := tkn.lastChar
  454. tkn.advance()
  455. if ch == EndChar {
  456. tkn.setErr("unexpected EOF in dollar-quoted string")
  457. return LexError, buf.Bytes()
  458. }
  459. if byte(ch) == delim[got] {
  460. got++
  461. if got == len(delim) {
  462. break
  463. }
  464. continue
  465. }
  466. if got > 0 {
  467. _, err := buf.Write(delim[:got])
  468. if err != nil {
  469. tkn.setErr("error reading dollar-quoted string: %v", err)
  470. return LexError, buf.Bytes()
  471. }
  472. got = 0
  473. }
  474. buf.WriteRune(ch)
  475. }
  476. if tkn.cfg.DollarQuotedFunc && string(delim) == "$func$" {
  477. return DollarQuotedFunc, buf.Bytes()
  478. }
  479. return DollarQuotedString, buf.Bytes()
  480. }
  481. func (tkn *SQLTokenizer) scanPreparedStatement(prefix rune) (TokenKind, []byte) {
  482. // a prepared statement expect a digit identifier like $1
  483. if !isDigit(tkn.lastChar) {
  484. tkn.setErr(`prepared statements must start with digits, got "%c" (%d)`, tkn.lastChar, tkn.lastChar)
  485. return LexError, tkn.bytes()
  486. }
  487. // scanNumber keeps the prefix rune intact.
  488. // read numbers and return an error if any
  489. token, buff := tkn.scanNumber(false)
  490. if token == LexError {
  491. tkn.setErr("invalid number")
  492. return LexError, tkn.bytes()
  493. }
  494. return PreparedStatement, buff
  495. }
  496. func (tkn *SQLTokenizer) scanEscapeSequence(braces rune) (TokenKind, []byte) {
  497. for tkn.lastChar != '}' && tkn.lastChar != EndChar {
  498. tkn.advance()
  499. }
  500. // we've reached the end of the string without finding
  501. // the closing curly braces
  502. if tkn.lastChar == EndChar {
  503. tkn.setErr("unexpected EOF in escape sequence")
  504. return LexError, tkn.bytes()
  505. }
  506. tkn.advance()
  507. return EscapeSequence, tkn.bytes()
  508. }
  509. func (tkn *SQLTokenizer) scanBindVar() (TokenKind, []byte) {
  510. token := ValueArg
  511. if tkn.lastChar == ':' {
  512. token = ListArg
  513. tkn.advance()
  514. }
  515. if !isLetter(tkn.lastChar) {
  516. tkn.setErr(`bind variables should start with letters, got "%c" (%d)`, tkn.lastChar, tkn.lastChar)
  517. return LexError, tkn.bytes()
  518. }
  519. for isLetter(tkn.lastChar) || isDigit(tkn.lastChar) || tkn.lastChar == '.' {
  520. tkn.advance()
  521. }
  522. return token, tkn.bytes()
  523. }
  524. func (tkn *SQLTokenizer) scanMantissa(base int) {
  525. for digitVal(tkn.lastChar) < base {
  526. tkn.advance()
  527. }
  528. }
  529. func (tkn *SQLTokenizer) scanNumber(seenDecimalPoint bool) (TokenKind, []byte) {
  530. if seenDecimalPoint {
  531. tkn.scanMantissa(10)
  532. goto exponent
  533. }
  534. if tkn.lastChar == '0' {
  535. // int or float
  536. tkn.advance()
  537. if tkn.lastChar == 'x' || tkn.lastChar == 'X' {
  538. // hexadecimal int
  539. tkn.advance()
  540. tkn.scanMantissa(16)
  541. } else {
  542. // octal int or float
  543. seenDecimalDigit := false
  544. tkn.scanMantissa(8)
  545. if tkn.lastChar == '8' || tkn.lastChar == '9' {
  546. // illegal octal int or float
  547. seenDecimalDigit = true
  548. tkn.scanMantissa(10)
  549. }
  550. if tkn.lastChar == '.' || tkn.lastChar == 'e' || tkn.lastChar == 'E' {
  551. goto fraction
  552. }
  553. // octal int
  554. if seenDecimalDigit {
  555. // tkn.setErr called in caller
  556. return LexError, tkn.bytes()
  557. }
  558. }
  559. goto exit
  560. }
  561. // decimal int or float
  562. tkn.scanMantissa(10)
  563. fraction:
  564. if tkn.lastChar == '.' {
  565. tkn.advance()
  566. tkn.scanMantissa(10)
  567. }
  568. exponent:
  569. if tkn.lastChar == 'e' || tkn.lastChar == 'E' {
  570. tkn.advance()
  571. if tkn.lastChar == '+' || tkn.lastChar == '-' {
  572. tkn.advance()
  573. }
  574. tkn.scanMantissa(10)
  575. }
  576. exit:
  577. t := tkn.bytes()
  578. if len(t) == 0 {
  579. return LexError, nil
  580. }
  581. return Number, t
  582. }
  583. func (tkn *SQLTokenizer) scanString(delim rune, kind TokenKind) (TokenKind, []byte) {
  584. buf := bytes.NewBuffer(tkn.buf[:0])
  585. for {
  586. ch := tkn.lastChar
  587. tkn.advance()
  588. if ch == delim {
  589. if tkn.lastChar == delim {
  590. // doubling a delimiter is the default way to embed the delimiter within a string
  591. tkn.advance()
  592. } else {
  593. // a single delimiter denotes the end of the string
  594. break
  595. }
  596. } else if ch == escapeCharacter {
  597. tkn.seenEscape = true
  598. if !tkn.literalEscapes {
  599. // treat as an escape character
  600. ch = tkn.lastChar
  601. tkn.advance()
  602. }
  603. }
  604. if ch == EndChar {
  605. tkn.setErr("unexpected EOF in string")
  606. return LexError, buf.Bytes()
  607. }
  608. buf.WriteRune(ch)
  609. }
  610. if kind == ID && buf.Len() == 0 || bytes.IndexFunc(buf.Bytes(), func(r rune) bool { return !unicode.IsSpace(r) }) == -1 {
  611. // This string is an empty or white-space only identifier.
  612. // We should keep the start and end delimiters in order to
  613. // avoid creating invalid queries.
  614. // See: https://github.com/DataDog/datadog-trace-agent/issues/316
  615. return kind, append(runeBytes(delim), runeBytes(delim)...)
  616. }
  617. return kind, buf.Bytes()
  618. }
  619. func (tkn *SQLTokenizer) scanCommentType1(prefix string) (TokenKind, []byte) {
  620. for tkn.lastChar != EndChar {
  621. if tkn.lastChar == '\n' {
  622. tkn.advance()
  623. break
  624. }
  625. tkn.advance()
  626. }
  627. return Comment, tkn.bytes()
  628. }
  629. func (tkn *SQLTokenizer) scanCommentType2() (TokenKind, []byte) {
  630. for {
  631. if tkn.lastChar == '*' {
  632. tkn.advance()
  633. if tkn.lastChar == '/' {
  634. tkn.advance()
  635. break
  636. }
  637. continue
  638. }
  639. if tkn.lastChar == EndChar {
  640. tkn.setErr("unexpected EOF in comment")
  641. return LexError, tkn.bytes()
  642. }
  643. tkn.advance()
  644. }
  645. return Comment, tkn.bytes()
  646. }
  647. // advance advances the tokenizer to the next rune. If the decoder encounters an error decoding, or
  648. // the end of the buffer is reached, tkn.lastChar will be set to EndChar. In case of a decoding
  649. // error, tkn.err will also be set.
  650. func (tkn *SQLTokenizer) advance() {
  651. ch, n := utf8.DecodeRune(tkn.buf[tkn.off:])
  652. if ch == utf8.RuneError && n < 2 {
  653. tkn.pos++
  654. tkn.lastChar = EndChar
  655. if n == 1 {
  656. tkn.setErr("invalid UTF-8 encoding beginning with 0x%x", tkn.buf[tkn.off])
  657. }
  658. return
  659. }
  660. if tkn.lastChar != 0 || tkn.pos > 0 {
  661. // we are past the first character
  662. tkn.pos += n
  663. }
  664. tkn.off += n
  665. tkn.lastChar = ch
  666. }
  667. // bytes returns all the bytes that were advanced over since its last call.
  668. // This excludes tkn.lastChar, which will remain in the buffer
  669. func (tkn *SQLTokenizer) bytes() []byte {
  670. if tkn.lastChar == EndChar {
  671. ret := tkn.buf[:tkn.off]
  672. tkn.buf = tkn.buf[tkn.off:]
  673. tkn.off = 0
  674. return ret
  675. }
  676. lastLen := utf8.RuneLen(tkn.lastChar)
  677. ret := tkn.buf[:tkn.off-lastLen]
  678. tkn.buf = tkn.buf[tkn.off-lastLen:]
  679. tkn.off = lastLen
  680. return ret
  681. }
  682. func isLeadingLetter(ch rune) bool {
  683. return unicode.IsLetter(ch) || ch == '_' || ch == '@'
  684. }
  685. func isLetter(ch rune) bool {
  686. return isLeadingLetter(ch) || ch == '#'
  687. }
  688. func digitVal(ch rune) int {
  689. switch {
  690. case '0' <= ch && ch <= '9':
  691. return int(ch) - '0'
  692. case 'a' <= ch && ch <= 'f':
  693. return int(ch) - 'a' + 10
  694. case 'A' <= ch && ch <= 'F':
  695. return int(ch) - 'A' + 10
  696. }
  697. return 16 // larger than any legal digit val
  698. }
  699. func isDigit(ch rune) bool { return '0' <= ch && ch <= '9' }
  700. // runeBytes converts the given rune to a slice of bytes.
  701. func runeBytes(r rune) []byte {
  702. buf := make([]byte, utf8.UTFMax)
  703. n := utf8.EncodeRune(buf, r)
  704. return buf[:n]
  705. }
  706. // isValidCharAfterOperator returns true if c is a valid character after an operator
  707. func isValidCharAfterOperator(c rune) bool {
  708. return c == '(' || c == '`' || c == '\'' || c == '"' || c == '+' || c == '-' || unicode.IsSpace(c) || isLetter(c) || isDigit(c)
  709. }