scanner.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617
  1. package influxql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "errors"
  6. "fmt"
  7. "io"
  8. )
  9. // Scanner represents a lexical scanner for InfluxQL.
  10. type Scanner struct {
  11. r *reader
  12. }
  13. // NewScanner returns a new instance of Scanner.
  14. func NewScanner(r io.Reader) *Scanner {
  15. return &Scanner{r: &reader{r: bufio.NewReader(r)}}
  16. }
  17. // Scan returns the next token and position from the underlying reader.
  18. // Also returns the literal text read for strings, numbers, and duration tokens
  19. // since these token types can have different literal representations.
  20. func (s *Scanner) Scan() (tok Token, pos Pos, lit string) {
  21. // Read next code point.
  22. ch0, pos := s.r.read()
  23. // If we see whitespace then consume all contiguous whitespace.
  24. // If we see a letter, or certain acceptable special characters, then consume
  25. // as an ident or reserved word.
  26. if isWhitespace(ch0) {
  27. return s.scanWhitespace()
  28. } else if isLetter(ch0) || ch0 == '_' {
  29. s.r.unread()
  30. return s.scanIdent(true)
  31. } else if isDigit(ch0) {
  32. return s.scanNumber()
  33. }
  34. // Otherwise parse individual characters.
  35. switch ch0 {
  36. case eof:
  37. return EOF, pos, ""
  38. case '"':
  39. s.r.unread()
  40. return s.scanIdent(true)
  41. case '\'':
  42. return s.scanString()
  43. case '.':
  44. ch1, _ := s.r.read()
  45. s.r.unread()
  46. if isDigit(ch1) {
  47. return s.scanNumber()
  48. }
  49. return DOT, pos, ""
  50. case '$':
  51. tok, _, lit = s.scanIdent(false)
  52. if tok != IDENT {
  53. return tok, pos, "$" + lit
  54. }
  55. return BOUNDPARAM, pos, "$" + lit
  56. case '+':
  57. return ADD, pos, ""
  58. case '-':
  59. ch1, _ := s.r.read()
  60. if ch1 == '-' {
  61. s.skipUntilNewline()
  62. return COMMENT, pos, ""
  63. }
  64. s.r.unread()
  65. return SUB, pos, ""
  66. case '*':
  67. return MUL, pos, ""
  68. case '/':
  69. ch1, _ := s.r.read()
  70. if ch1 == '*' {
  71. if err := s.skipUntilEndComment(); err != nil {
  72. return ILLEGAL, pos, ""
  73. }
  74. return COMMENT, pos, ""
  75. } else {
  76. s.r.unread()
  77. }
  78. return DIV, pos, ""
  79. case '%':
  80. return MOD, pos, ""
  81. case '&':
  82. return BITWISE_AND, pos, ""
  83. case '|':
  84. return BITWISE_OR, pos, ""
  85. case '^':
  86. return BITWISE_XOR, pos, ""
  87. case '=':
  88. if ch1, _ := s.r.read(); ch1 == '~' {
  89. return EQREGEX, pos, ""
  90. }
  91. s.r.unread()
  92. return EQ, pos, ""
  93. case '!':
  94. if ch1, _ := s.r.read(); ch1 == '=' {
  95. return NEQ, pos, ""
  96. } else if ch1 == '~' {
  97. return NEQREGEX, pos, ""
  98. }
  99. s.r.unread()
  100. case '>':
  101. if ch1, _ := s.r.read(); ch1 == '=' {
  102. return GTE, pos, ""
  103. }
  104. s.r.unread()
  105. return GT, pos, ""
  106. case '<':
  107. if ch1, _ := s.r.read(); ch1 == '=' {
  108. return LTE, pos, ""
  109. } else if ch1 == '>' {
  110. return NEQ, pos, ""
  111. }
  112. s.r.unread()
  113. return LT, pos, ""
  114. case '(':
  115. return LPAREN, pos, ""
  116. case ')':
  117. return RPAREN, pos, ""
  118. case ',':
  119. return COMMA, pos, ""
  120. case ';':
  121. return SEMICOLON, pos, ""
  122. case ':':
  123. if ch1, _ := s.r.read(); ch1 == ':' {
  124. return DOUBLECOLON, pos, ""
  125. }
  126. s.r.unread()
  127. return COLON, pos, ""
  128. }
  129. return ILLEGAL, pos, string(ch0)
  130. }
  131. // scanWhitespace consumes the current rune and all contiguous whitespace.
  132. func (s *Scanner) scanWhitespace() (tok Token, pos Pos, lit string) {
  133. // Create a buffer and read the current character into it.
  134. var buf bytes.Buffer
  135. ch, pos := s.r.curr()
  136. _, _ = buf.WriteRune(ch)
  137. // Read every subsequent whitespace character into the buffer.
  138. // Non-whitespace characters and EOF will cause the loop to exit.
  139. for {
  140. ch, _ = s.r.read()
  141. if ch == eof {
  142. break
  143. } else if !isWhitespace(ch) {
  144. s.r.unread()
  145. break
  146. } else {
  147. _, _ = buf.WriteRune(ch)
  148. }
  149. }
  150. return WS, pos, buf.String()
  151. }
  152. // skipUntilNewline skips characters until it reaches a newline.
  153. func (s *Scanner) skipUntilNewline() {
  154. for {
  155. if ch, _ := s.r.read(); ch == '\n' || ch == eof {
  156. return
  157. }
  158. }
  159. }
  160. // skipUntilEndComment skips characters until it reaches a '*/' symbol.
  161. func (s *Scanner) skipUntilEndComment() error {
  162. for {
  163. if ch1, _ := s.r.read(); ch1 == '*' {
  164. // We might be at the end.
  165. star:
  166. ch2, _ := s.r.read()
  167. if ch2 == '/' {
  168. return nil
  169. } else if ch2 == '*' {
  170. // We are back in the state machine since we see a star.
  171. goto star
  172. } else if ch2 == eof {
  173. return io.EOF
  174. }
  175. } else if ch1 == eof {
  176. return io.EOF
  177. }
  178. }
  179. }
  180. func (s *Scanner) scanIdent(lookup bool) (tok Token, pos Pos, lit string) {
  181. // Save the starting position of the identifier.
  182. _, pos = s.r.read()
  183. s.r.unread()
  184. var buf bytes.Buffer
  185. for {
  186. if ch, _ := s.r.read(); ch == eof {
  187. break
  188. } else if ch == '"' {
  189. tok0, pos0, lit0 := s.scanString()
  190. if tok0 == BADSTRING || tok0 == BADESCAPE {
  191. return tok0, pos0, lit0
  192. }
  193. return IDENT, pos, lit0
  194. } else if isIdentChar(ch) {
  195. s.r.unread()
  196. buf.WriteString(ScanBareIdent(s.r))
  197. } else {
  198. s.r.unread()
  199. break
  200. }
  201. }
  202. lit = buf.String()
  203. // If the literal matches a keyword then return that keyword.
  204. if lookup {
  205. if tok = Lookup(lit); tok != IDENT {
  206. return tok, pos, ""
  207. }
  208. }
  209. return IDENT, pos, lit
  210. }
  211. // scanString consumes a contiguous string of non-quote characters.
  212. // Quote characters can be consumed if they're first escaped with a backslash.
  213. func (s *Scanner) scanString() (tok Token, pos Pos, lit string) {
  214. s.r.unread()
  215. _, pos = s.r.curr()
  216. var err error
  217. lit, err = ScanString(s.r)
  218. if err == errBadString {
  219. return BADSTRING, pos, lit
  220. } else if err == errBadEscape {
  221. _, pos = s.r.curr()
  222. return BADESCAPE, pos, lit
  223. }
  224. return STRING, pos, lit
  225. }
  226. // ScanRegex consumes a token to find escapes
  227. func (s *Scanner) ScanRegex() (tok Token, pos Pos, lit string) {
  228. _, pos = s.r.curr()
  229. // Start & end sentinels.
  230. start, end := '/', '/'
  231. // Valid escape chars.
  232. escapes := map[rune]rune{'/': '/'}
  233. b, err := ScanDelimited(s.r, start, end, escapes, true)
  234. if err == errBadEscape {
  235. _, pos = s.r.curr()
  236. return BADESCAPE, pos, lit
  237. } else if err != nil {
  238. return BADREGEX, pos, lit
  239. }
  240. return REGEX, pos, string(b)
  241. }
  242. // scanNumber consumes anything that looks like the start of a number.
  243. func (s *Scanner) scanNumber() (tok Token, pos Pos, lit string) {
  244. var buf bytes.Buffer
  245. // Check if the initial rune is a ".".
  246. ch, pos := s.r.curr()
  247. if ch == '.' {
  248. // Peek and see if the next rune is a digit.
  249. ch1, _ := s.r.read()
  250. s.r.unread()
  251. if !isDigit(ch1) {
  252. return ILLEGAL, pos, "."
  253. }
  254. // Unread the full stop so we can read it later.
  255. s.r.unread()
  256. } else {
  257. s.r.unread()
  258. }
  259. // Read as many digits as possible.
  260. _, _ = buf.WriteString(s.scanDigits())
  261. // If next code points are a full stop and digit then consume them.
  262. isDecimal := false
  263. if ch0, _ := s.r.read(); ch0 == '.' {
  264. isDecimal = true
  265. if ch1, _ := s.r.read(); isDigit(ch1) {
  266. _, _ = buf.WriteRune(ch0)
  267. _, _ = buf.WriteRune(ch1)
  268. _, _ = buf.WriteString(s.scanDigits())
  269. } else {
  270. s.r.unread()
  271. }
  272. } else {
  273. s.r.unread()
  274. }
  275. // Read as a duration or integer if it doesn't have a fractional part.
  276. if !isDecimal {
  277. // If the next rune is a letter then this is a duration token.
  278. if ch0, _ := s.r.read(); isLetter(ch0) || ch0 == 'µ' {
  279. _, _ = buf.WriteRune(ch0)
  280. for {
  281. ch1, _ := s.r.read()
  282. if !isLetter(ch1) && ch1 != 'µ' {
  283. s.r.unread()
  284. break
  285. }
  286. _, _ = buf.WriteRune(ch1)
  287. }
  288. // Continue reading digits and letters as part of this token.
  289. for {
  290. if ch0, _ := s.r.read(); isLetter(ch0) || ch0 == 'µ' || isDigit(ch0) {
  291. _, _ = buf.WriteRune(ch0)
  292. } else {
  293. s.r.unread()
  294. break
  295. }
  296. }
  297. return DURATIONVAL, pos, buf.String()
  298. } else {
  299. s.r.unread()
  300. return INTEGER, pos, buf.String()
  301. }
  302. }
  303. return NUMBER, pos, buf.String()
  304. }
  305. // scanDigits consumes a contiguous series of digits.
  306. func (s *Scanner) scanDigits() string {
  307. var buf bytes.Buffer
  308. for {
  309. ch, _ := s.r.read()
  310. if !isDigit(ch) {
  311. s.r.unread()
  312. break
  313. }
  314. _, _ = buf.WriteRune(ch)
  315. }
  316. return buf.String()
  317. }
  318. // isWhitespace returns true if the rune is a space, tab, or newline.
  319. func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' }
  320. // isLetter returns true if the rune is a letter.
  321. func isLetter(ch rune) bool { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') }
  322. // isDigit returns true if the rune is a digit.
  323. func isDigit(ch rune) bool { return (ch >= '0' && ch <= '9') }
  324. // isIdentChar returns true if the rune can be used in an unquoted identifier.
  325. func isIdentChar(ch rune) bool { return isLetter(ch) || isDigit(ch) || ch == '_' }
  326. // isIdentFirstChar returns true if the rune can be used as the first char in an unquoted identifer.
  327. func isIdentFirstChar(ch rune) bool { return isLetter(ch) || ch == '_' }
  328. // bufScanner represents a wrapper for scanner to add a buffer.
  329. // It provides a fixed-length circular buffer that can be unread.
  330. type bufScanner struct {
  331. s *Scanner
  332. i int // buffer index
  333. n int // buffer size
  334. buf [3]struct {
  335. tok Token
  336. pos Pos
  337. lit string
  338. }
  339. }
  340. // newBufScanner returns a new buffered scanner for a reader.
  341. func newBufScanner(r io.Reader) *bufScanner {
  342. return &bufScanner{s: NewScanner(r)}
  343. }
  344. // Scan reads the next token from the scanner.
  345. func (s *bufScanner) Scan() (tok Token, pos Pos, lit string) {
  346. return s.scanFunc(s.s.Scan)
  347. }
  348. // ScanRegex reads a regex token from the scanner.
  349. func (s *bufScanner) ScanRegex() (tok Token, pos Pos, lit string) {
  350. return s.scanFunc(s.s.ScanRegex)
  351. }
  352. // scanFunc uses the provided function to scan the next token.
  353. func (s *bufScanner) scanFunc(scan func() (Token, Pos, string)) (tok Token, pos Pos, lit string) {
  354. // If we have unread tokens then read them off the buffer first.
  355. if s.n > 0 {
  356. s.n--
  357. return s.curr()
  358. }
  359. // Move buffer position forward and save the token.
  360. s.i = (s.i + 1) % len(s.buf)
  361. buf := &s.buf[s.i]
  362. buf.tok, buf.pos, buf.lit = scan()
  363. return s.curr()
  364. }
  365. // Unscan pushes the previously token back onto the buffer.
  366. func (s *bufScanner) Unscan() { s.n++ }
  367. // curr returns the last read token.
  368. func (s *bufScanner) curr() (tok Token, pos Pos, lit string) {
  369. buf := &s.buf[(s.i-s.n+len(s.buf))%len(s.buf)]
  370. return buf.tok, buf.pos, buf.lit
  371. }
  372. // reader represents a buffered rune reader used by the scanner.
  373. // It provides a fixed-length circular buffer that can be unread.
  374. type reader struct {
  375. r io.RuneScanner
  376. i int // buffer index
  377. n int // buffer char count
  378. pos Pos // last read rune position
  379. buf [3]struct {
  380. ch rune
  381. pos Pos
  382. }
  383. eof bool // true if reader has ever seen eof.
  384. }
  385. // ReadRune reads the next rune from the reader.
  386. // This is a wrapper function to implement the io.RuneReader interface.
  387. // Note that this function does not return size.
  388. func (r *reader) ReadRune() (ch rune, size int, err error) {
  389. ch, _ = r.read()
  390. if ch == eof {
  391. err = io.EOF
  392. }
  393. return
  394. }
  395. // UnreadRune pushes the previously read rune back onto the buffer.
  396. // This is a wrapper function to implement the io.RuneScanner interface.
  397. func (r *reader) UnreadRune() error {
  398. r.unread()
  399. return nil
  400. }
  401. // read reads the next rune from the reader.
  402. func (r *reader) read() (ch rune, pos Pos) {
  403. // If we have unread characters then read them off the buffer first.
  404. if r.n > 0 {
  405. r.n--
  406. return r.curr()
  407. }
  408. // Read next rune from underlying reader.
  409. // Any error (including io.EOF) should return as EOF.
  410. ch, _, err := r.r.ReadRune()
  411. if err != nil {
  412. ch = eof
  413. } else if ch == '\r' {
  414. if ch, _, err := r.r.ReadRune(); err != nil {
  415. // nop
  416. } else if ch != '\n' {
  417. _ = r.r.UnreadRune()
  418. }
  419. ch = '\n'
  420. }
  421. // Save character and position to the buffer.
  422. r.i = (r.i + 1) % len(r.buf)
  423. buf := &r.buf[r.i]
  424. buf.ch, buf.pos = ch, r.pos
  425. // Update position.
  426. // Only count EOF once.
  427. if ch == '\n' {
  428. r.pos.Line++
  429. r.pos.Char = 0
  430. } else if !r.eof {
  431. r.pos.Char++
  432. }
  433. // Mark the reader as EOF.
  434. // This is used so we don't double count EOF characters.
  435. if ch == eof {
  436. r.eof = true
  437. }
  438. return r.curr()
  439. }
  440. // unread pushes the previously read rune back onto the buffer.
  441. func (r *reader) unread() {
  442. r.n++
  443. }
  444. // curr returns the last read character and position.
  445. func (r *reader) curr() (ch rune, pos Pos) {
  446. i := (r.i - r.n + len(r.buf)) % len(r.buf)
  447. buf := &r.buf[i]
  448. return buf.ch, buf.pos
  449. }
  450. // eof is a marker code point to signify that the reader can't read any more.
  451. const eof = rune(0)
  452. // ScanDelimited reads a delimited set of runes
  453. func ScanDelimited(r io.RuneScanner, start, end rune, escapes map[rune]rune, escapesPassThru bool) ([]byte, error) {
  454. // Scan start delimiter.
  455. if ch, _, err := r.ReadRune(); err != nil {
  456. return nil, err
  457. } else if ch != start {
  458. return nil, fmt.Errorf("expected %s; found %s", string(start), string(ch))
  459. }
  460. var buf bytes.Buffer
  461. for {
  462. ch0, _, err := r.ReadRune()
  463. if ch0 == end {
  464. return buf.Bytes(), nil
  465. } else if err != nil {
  466. return buf.Bytes(), err
  467. } else if ch0 == '\n' {
  468. return nil, errors.New("delimited text contains new line")
  469. } else if ch0 == '\\' {
  470. // If the next character is an escape then write the escaped char.
  471. // If it's not a valid escape then return an error.
  472. ch1, _, err := r.ReadRune()
  473. if err != nil {
  474. return nil, err
  475. }
  476. c, ok := escapes[ch1]
  477. if !ok {
  478. if escapesPassThru {
  479. // Unread ch1 (char after the \)
  480. _ = r.UnreadRune()
  481. // Write ch0 (\) to the output buffer.
  482. _, _ = buf.WriteRune(ch0)
  483. continue
  484. } else {
  485. buf.Reset()
  486. _, _ = buf.WriteRune(ch0)
  487. _, _ = buf.WriteRune(ch1)
  488. return buf.Bytes(), errBadEscape
  489. }
  490. }
  491. _, _ = buf.WriteRune(c)
  492. } else {
  493. _, _ = buf.WriteRune(ch0)
  494. }
  495. }
  496. }
  497. // ScanString reads a quoted string from a rune reader.
  498. func ScanString(r io.RuneScanner) (string, error) {
  499. ending, _, err := r.ReadRune()
  500. if err != nil {
  501. return "", errBadString
  502. }
  503. var buf bytes.Buffer
  504. for {
  505. ch0, _, err := r.ReadRune()
  506. if ch0 == ending {
  507. return buf.String(), nil
  508. } else if err != nil || ch0 == '\n' {
  509. return buf.String(), errBadString
  510. } else if ch0 == '\\' {
  511. // If the next character is an escape then write the escaped char.
  512. // If it's not a valid escape then return an error.
  513. ch1, _, _ := r.ReadRune()
  514. if ch1 == 'n' {
  515. _, _ = buf.WriteRune('\n')
  516. } else if ch1 == '\\' {
  517. _, _ = buf.WriteRune('\\')
  518. } else if ch1 == '"' {
  519. _, _ = buf.WriteRune('"')
  520. } else if ch1 == '\'' {
  521. _, _ = buf.WriteRune('\'')
  522. } else {
  523. return string(ch0) + string(ch1), errBadEscape
  524. }
  525. } else {
  526. _, _ = buf.WriteRune(ch0)
  527. }
  528. }
  529. }
  530. var errBadString = errors.New("bad string")
  531. var errBadEscape = errors.New("bad escape")
  532. // ScanBareIdent reads bare identifier from a rune reader.
  533. func ScanBareIdent(r io.RuneScanner) string {
  534. // Read every ident character into the buffer.
  535. // Non-ident characters and EOF will cause the loop to exit.
  536. var buf bytes.Buffer
  537. for {
  538. ch, _, err := r.ReadRune()
  539. if err != nil {
  540. break
  541. } else if !isIdentChar(ch) {
  542. r.UnreadRune()
  543. break
  544. } else {
  545. _, _ = buf.WriteRune(ch)
  546. }
  547. }
  548. return buf.String()
  549. }
  550. // IsRegexOp returns true if the operator accepts a regex operand.
  551. func IsRegexOp(t Token) bool {
  552. return (t == EQREGEX || t == NEQREGEX)
  553. }