scanner.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608
  1. // Copyright 2010 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package json
  5. // JSON value parser state machine.
  6. // Just about at the limit of what is reasonable to write by hand.
  7. // Some parts are a bit tedious, but overall it nicely factors out the
  8. // otherwise common code from the multiple scanning functions
  9. // in this package (Compact, Indent, checkValid, etc).
  10. //
  11. // This file starts with two simple examples using the scanner
  12. // before diving into the scanner itself.
  13. import (
  14. "strconv"
  15. "sync"
  16. )
  17. // Valid reports whether data is a valid JSON encoding.
  18. func Valid(data []byte) bool {
  19. scan := newScanner()
  20. defer freeScanner(scan)
  21. return checkValid(data, scan) == nil
  22. }
  23. // checkValid verifies that data is valid JSON-encoded data.
  24. // scan is passed in for use by checkValid to avoid an allocation.
  25. func checkValid(data []byte, scan *scanner) error {
  26. scan.reset()
  27. for _, c := range data {
  28. scan.bytes++
  29. if scan.step(scan, c) == scanError {
  30. return scan.err
  31. }
  32. }
  33. if scan.eof() == scanError {
  34. return scan.err
  35. }
  36. return nil
  37. }
  38. // A SyntaxError is a description of a JSON syntax error.
  39. type SyntaxError struct {
  40. msg string // description of error
  41. Offset int64 // error occurred after reading Offset bytes
  42. }
  43. func (e *SyntaxError) Error() string { return e.msg }
  44. // A scanner is a JSON scanning state machine.
  45. // Callers call scan.reset and then pass bytes in one at a time
  46. // by calling scan.step(&scan, c) for each byte.
  47. // The return value, referred to as an opcode, tells the
  48. // caller about significant parsing events like beginning
  49. // and ending literals, objects, and arrays, so that the
  50. // caller can follow along if it wishes.
  51. // The return value scanEnd indicates that a single top-level
  52. // JSON value has been completed, *before* the byte that
  53. // just got passed in. (The indication must be delayed in order
  54. // to recognize the end of numbers: is 123 a whole value or
  55. // the beginning of 12345e+6?).
  56. type scanner struct {
  57. // The step is a func to be called to execute the next transition.
  58. // Also tried using an integer constant and a single func
  59. // with a switch, but using the func directly was 10% faster
  60. // on a 64-bit Mac Mini, and it's nicer to read.
  61. step func(*scanner, byte) int
  62. // Reached end of top-level value.
  63. endTop bool
  64. // Stack of what we're in the middle of - array values, object keys, object values.
  65. parseState []int
  66. // Error that happened, if any.
  67. err error
  68. // total bytes consumed, updated by decoder.Decode (and deliberately
  69. // not set to zero by scan.reset)
  70. bytes int64
  71. }
  72. var scannerPool = sync.Pool{
  73. New: func() interface{} {
  74. return &scanner{}
  75. },
  76. }
  77. func newScanner() *scanner {
  78. scan := scannerPool.Get().(*scanner)
  79. // scan.reset by design doesn't set bytes to zero
  80. scan.bytes = 0
  81. scan.reset()
  82. return scan
  83. }
  84. func freeScanner(scan *scanner) {
  85. // Avoid hanging on to too much memory in extreme cases.
  86. if len(scan.parseState) > 1024 {
  87. scan.parseState = nil
  88. }
  89. scannerPool.Put(scan)
  90. }
  91. // These values are returned by the state transition functions
  92. // assigned to scanner.state and the method scanner.eof.
  93. // They give details about the current state of the scan that
  94. // callers might be interested to know about.
  95. // It is okay to ignore the return value of any particular
  96. // call to scanner.state: if one call returns scanError,
  97. // every subsequent call will return scanError too.
  98. const (
  99. // Continue.
  100. scanContinue = iota // uninteresting byte
  101. scanBeginLiteral // end implied by next result != scanContinue
  102. scanBeginObject // begin object
  103. scanObjectKey // just finished object key (string)
  104. scanObjectValue // just finished non-last object value
  105. scanEndObject // end object (implies scanObjectValue if possible)
  106. scanBeginArray // begin array
  107. scanArrayValue // just finished array value
  108. scanEndArray // end array (implies scanArrayValue if possible)
  109. scanSkipSpace // space byte; can skip; known to be last "continue" result
  110. // Stop.
  111. scanEnd // top-level value ended *before* this byte; known to be first "stop" result
  112. scanError // hit an error, scanner.err.
  113. )
  114. // These values are stored in the parseState stack.
  115. // They give the current state of a composite value
  116. // being scanned. If the parser is inside a nested value
  117. // the parseState describes the nested state, outermost at entry 0.
  118. const (
  119. parseObjectKey = iota // parsing object key (before colon)
  120. parseObjectValue // parsing object value (after colon)
  121. parseArrayValue // parsing array value
  122. )
  123. // This limits the max nesting depth to prevent stack overflow.
  124. // This is permitted by https://tools.ietf.org/html/rfc7159#section-9
  125. const maxNestingDepth = 10000
  126. // reset prepares the scanner for use.
  127. // It must be called before calling s.step.
  128. func (s *scanner) reset() {
  129. s.step = stateBeginValue
  130. s.parseState = s.parseState[0:0]
  131. s.err = nil
  132. s.endTop = false
  133. }
  134. // eof tells the scanner that the end of input has been reached.
  135. // It returns a scan status just as s.step does.
  136. func (s *scanner) eof() int {
  137. if s.err != nil {
  138. return scanError
  139. }
  140. if s.endTop {
  141. return scanEnd
  142. }
  143. s.step(s, ' ')
  144. if s.endTop {
  145. return scanEnd
  146. }
  147. if s.err == nil {
  148. s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
  149. }
  150. return scanError
  151. }
  152. // pushParseState pushes a new parse state p onto the parse stack.
  153. // an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
  154. func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
  155. s.parseState = append(s.parseState, newParseState)
  156. if len(s.parseState) <= maxNestingDepth {
  157. return successState
  158. }
  159. return s.error(c, "exceeded max depth")
  160. }
  161. // popParseState pops a parse state (already obtained) off the stack
  162. // and updates s.step accordingly.
  163. func (s *scanner) popParseState() {
  164. n := len(s.parseState) - 1
  165. s.parseState = s.parseState[0:n]
  166. if n == 0 {
  167. s.step = stateEndTop
  168. s.endTop = true
  169. } else {
  170. s.step = stateEndValue
  171. }
  172. }
  173. func isSpace(c byte) bool {
  174. return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
  175. }
  176. // stateBeginValueOrEmpty is the state after reading `[`.
  177. func stateBeginValueOrEmpty(s *scanner, c byte) int {
  178. if isSpace(c) {
  179. return scanSkipSpace
  180. }
  181. if c == ']' {
  182. return stateEndValue(s, c)
  183. }
  184. return stateBeginValue(s, c)
  185. }
  186. // stateBeginValue is the state at the beginning of the input.
  187. func stateBeginValue(s *scanner, c byte) int {
  188. if isSpace(c) {
  189. return scanSkipSpace
  190. }
  191. switch c {
  192. case '{':
  193. s.step = stateBeginStringOrEmpty
  194. return s.pushParseState(c, parseObjectKey, scanBeginObject)
  195. case '[':
  196. s.step = stateBeginValueOrEmpty
  197. return s.pushParseState(c, parseArrayValue, scanBeginArray)
  198. case '"':
  199. s.step = stateInString
  200. return scanBeginLiteral
  201. case '-':
  202. s.step = stateNeg
  203. return scanBeginLiteral
  204. case '0': // beginning of 0.123
  205. s.step = state0
  206. return scanBeginLiteral
  207. case 't': // beginning of true
  208. s.step = stateT
  209. return scanBeginLiteral
  210. case 'f': // beginning of false
  211. s.step = stateF
  212. return scanBeginLiteral
  213. case 'n': // beginning of null
  214. s.step = stateN
  215. return scanBeginLiteral
  216. }
  217. if '1' <= c && c <= '9' { // beginning of 1234.5
  218. s.step = state1
  219. return scanBeginLiteral
  220. }
  221. return s.error(c, "looking for beginning of value")
  222. }
  223. // stateBeginStringOrEmpty is the state after reading `{`.
  224. func stateBeginStringOrEmpty(s *scanner, c byte) int {
  225. if isSpace(c) {
  226. return scanSkipSpace
  227. }
  228. if c == '}' {
  229. n := len(s.parseState)
  230. s.parseState[n-1] = parseObjectValue
  231. return stateEndValue(s, c)
  232. }
  233. return stateBeginString(s, c)
  234. }
  235. // stateBeginString is the state after reading `{"key": value,`.
  236. func stateBeginString(s *scanner, c byte) int {
  237. if isSpace(c) {
  238. return scanSkipSpace
  239. }
  240. if c == '"' {
  241. s.step = stateInString
  242. return scanBeginLiteral
  243. }
  244. return s.error(c, "looking for beginning of object key string")
  245. }
  246. // stateEndValue is the state after completing a value,
  247. // such as after reading `{}` or `true` or `["x"`.
  248. func stateEndValue(s *scanner, c byte) int {
  249. n := len(s.parseState)
  250. if n == 0 {
  251. // Completed top-level before the current byte.
  252. s.step = stateEndTop
  253. s.endTop = true
  254. return stateEndTop(s, c)
  255. }
  256. if isSpace(c) {
  257. s.step = stateEndValue
  258. return scanSkipSpace
  259. }
  260. ps := s.parseState[n-1]
  261. switch ps {
  262. case parseObjectKey:
  263. if c == ':' {
  264. s.parseState[n-1] = parseObjectValue
  265. s.step = stateBeginValue
  266. return scanObjectKey
  267. }
  268. return s.error(c, "after object key")
  269. case parseObjectValue:
  270. if c == ',' {
  271. s.parseState[n-1] = parseObjectKey
  272. s.step = stateBeginString
  273. return scanObjectValue
  274. }
  275. if c == '}' {
  276. s.popParseState()
  277. return scanEndObject
  278. }
  279. return s.error(c, "after object key:value pair")
  280. case parseArrayValue:
  281. if c == ',' {
  282. s.step = stateBeginValue
  283. return scanArrayValue
  284. }
  285. if c == ']' {
  286. s.popParseState()
  287. return scanEndArray
  288. }
  289. return s.error(c, "after array element")
  290. }
  291. return s.error(c, "")
  292. }
  293. // stateEndTop is the state after finishing the top-level value,
  294. // such as after reading `{}` or `[1,2,3]`.
  295. // Only space characters should be seen now.
  296. func stateEndTop(s *scanner, c byte) int {
  297. if !isSpace(c) {
  298. // Complain about non-space byte on next call.
  299. s.error(c, "after top-level value")
  300. }
  301. return scanEnd
  302. }
  303. // stateInString is the state after reading `"`.
  304. func stateInString(s *scanner, c byte) int {
  305. if c == '"' {
  306. s.step = stateEndValue
  307. return scanContinue
  308. }
  309. if c == '\\' {
  310. s.step = stateInStringEsc
  311. return scanContinue
  312. }
  313. if c < 0x20 {
  314. return s.error(c, "in string literal")
  315. }
  316. return scanContinue
  317. }
  318. // stateInStringEsc is the state after reading `"\` during a quoted string.
  319. func stateInStringEsc(s *scanner, c byte) int {
  320. switch c {
  321. case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
  322. s.step = stateInString
  323. return scanContinue
  324. case 'u':
  325. s.step = stateInStringEscU
  326. return scanContinue
  327. }
  328. return s.error(c, "in string escape code")
  329. }
  330. // stateInStringEscU is the state after reading `"\u` during a quoted string.
  331. func stateInStringEscU(s *scanner, c byte) int {
  332. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  333. s.step = stateInStringEscU1
  334. return scanContinue
  335. }
  336. // numbers
  337. return s.error(c, "in \\u hexadecimal character escape")
  338. }
  339. // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
  340. func stateInStringEscU1(s *scanner, c byte) int {
  341. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  342. s.step = stateInStringEscU12
  343. return scanContinue
  344. }
  345. // numbers
  346. return s.error(c, "in \\u hexadecimal character escape")
  347. }
  348. // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
  349. func stateInStringEscU12(s *scanner, c byte) int {
  350. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  351. s.step = stateInStringEscU123
  352. return scanContinue
  353. }
  354. // numbers
  355. return s.error(c, "in \\u hexadecimal character escape")
  356. }
  357. // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
  358. func stateInStringEscU123(s *scanner, c byte) int {
  359. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  360. s.step = stateInString
  361. return scanContinue
  362. }
  363. // numbers
  364. return s.error(c, "in \\u hexadecimal character escape")
  365. }
  366. // stateNeg is the state after reading `-` during a number.
  367. func stateNeg(s *scanner, c byte) int {
  368. if c == '0' {
  369. s.step = state0
  370. return scanContinue
  371. }
  372. if '1' <= c && c <= '9' {
  373. s.step = state1
  374. return scanContinue
  375. }
  376. return s.error(c, "in numeric literal")
  377. }
  378. // state1 is the state after reading a non-zero integer during a number,
  379. // such as after reading `1` or `100` but not `0`.
  380. func state1(s *scanner, c byte) int {
  381. if '0' <= c && c <= '9' {
  382. s.step = state1
  383. return scanContinue
  384. }
  385. return state0(s, c)
  386. }
  387. // state0 is the state after reading `0` during a number.
  388. func state0(s *scanner, c byte) int {
  389. if c == '.' {
  390. s.step = stateDot
  391. return scanContinue
  392. }
  393. if c == 'e' || c == 'E' {
  394. s.step = stateE
  395. return scanContinue
  396. }
  397. return stateEndValue(s, c)
  398. }
  399. // stateDot is the state after reading the integer and decimal point in a number,
  400. // such as after reading `1.`.
  401. func stateDot(s *scanner, c byte) int {
  402. if '0' <= c && c <= '9' {
  403. s.step = stateDot0
  404. return scanContinue
  405. }
  406. return s.error(c, "after decimal point in numeric literal")
  407. }
  408. // stateDot0 is the state after reading the integer, decimal point, and subsequent
  409. // digits of a number, such as after reading `3.14`.
  410. func stateDot0(s *scanner, c byte) int {
  411. if '0' <= c && c <= '9' {
  412. return scanContinue
  413. }
  414. if c == 'e' || c == 'E' {
  415. s.step = stateE
  416. return scanContinue
  417. }
  418. return stateEndValue(s, c)
  419. }
  420. // stateE is the state after reading the mantissa and e in a number,
  421. // such as after reading `314e` or `0.314e`.
  422. func stateE(s *scanner, c byte) int {
  423. if c == '+' || c == '-' {
  424. s.step = stateESign
  425. return scanContinue
  426. }
  427. return stateESign(s, c)
  428. }
  429. // stateESign is the state after reading the mantissa, e, and sign in a number,
  430. // such as after reading `314e-` or `0.314e+`.
  431. func stateESign(s *scanner, c byte) int {
  432. if '0' <= c && c <= '9' {
  433. s.step = stateE0
  434. return scanContinue
  435. }
  436. return s.error(c, "in exponent of numeric literal")
  437. }
  438. // stateE0 is the state after reading the mantissa, e, optional sign,
  439. // and at least one digit of the exponent in a number,
  440. // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
  441. func stateE0(s *scanner, c byte) int {
  442. if '0' <= c && c <= '9' {
  443. return scanContinue
  444. }
  445. return stateEndValue(s, c)
  446. }
  447. // stateT is the state after reading `t`.
  448. func stateT(s *scanner, c byte) int {
  449. if c == 'r' {
  450. s.step = stateTr
  451. return scanContinue
  452. }
  453. return s.error(c, "in literal true (expecting 'r')")
  454. }
  455. // stateTr is the state after reading `tr`.
  456. func stateTr(s *scanner, c byte) int {
  457. if c == 'u' {
  458. s.step = stateTru
  459. return scanContinue
  460. }
  461. return s.error(c, "in literal true (expecting 'u')")
  462. }
  463. // stateTru is the state after reading `tru`.
  464. func stateTru(s *scanner, c byte) int {
  465. if c == 'e' {
  466. s.step = stateEndValue
  467. return scanContinue
  468. }
  469. return s.error(c, "in literal true (expecting 'e')")
  470. }
  471. // stateF is the state after reading `f`.
  472. func stateF(s *scanner, c byte) int {
  473. if c == 'a' {
  474. s.step = stateFa
  475. return scanContinue
  476. }
  477. return s.error(c, "in literal false (expecting 'a')")
  478. }
  479. // stateFa is the state after reading `fa`.
  480. func stateFa(s *scanner, c byte) int {
  481. if c == 'l' {
  482. s.step = stateFal
  483. return scanContinue
  484. }
  485. return s.error(c, "in literal false (expecting 'l')")
  486. }
  487. // stateFal is the state after reading `fal`.
  488. func stateFal(s *scanner, c byte) int {
  489. if c == 's' {
  490. s.step = stateFals
  491. return scanContinue
  492. }
  493. return s.error(c, "in literal false (expecting 's')")
  494. }
  495. // stateFals is the state after reading `fals`.
  496. func stateFals(s *scanner, c byte) int {
  497. if c == 'e' {
  498. s.step = stateEndValue
  499. return scanContinue
  500. }
  501. return s.error(c, "in literal false (expecting 'e')")
  502. }
  503. // stateN is the state after reading `n`.
  504. func stateN(s *scanner, c byte) int {
  505. if c == 'u' {
  506. s.step = stateNu
  507. return scanContinue
  508. }
  509. return s.error(c, "in literal null (expecting 'u')")
  510. }
  511. // stateNu is the state after reading `nu`.
  512. func stateNu(s *scanner, c byte) int {
  513. if c == 'l' {
  514. s.step = stateNul
  515. return scanContinue
  516. }
  517. return s.error(c, "in literal null (expecting 'l')")
  518. }
  519. // stateNul is the state after reading `nul`.
  520. func stateNul(s *scanner, c byte) int {
  521. if c == 'l' {
  522. s.step = stateEndValue
  523. return scanContinue
  524. }
  525. return s.error(c, "in literal null (expecting 'l')")
  526. }
  527. // stateError is the state after reaching a syntax error,
  528. // such as after reading `[1}` or `5.1.2`.
  529. func stateError(s *scanner, c byte) int {
  530. return scanError
  531. }
  532. // error records an error and switches to the error state.
  533. func (s *scanner) error(c byte, context string) int {
  534. s.step = stateError
  535. s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
  536. return scanError
  537. }
  538. // quoteChar formats c as a quoted character literal
  539. func quoteChar(c byte) string {
  540. // special cases - different from quoted strings
  541. if c == '\'' {
  542. return `'\''`
  543. }
  544. if c == '"' {
  545. return `'"'`
  546. }
  547. // use quoted string with different quotation marks
  548. s := strconv.Quote(string(c))
  549. return "'" + s[1:len(s)-1] + "'"
  550. }