// json_scanner.go
  1. // Unless explicitly stated otherwise all files in this repository are licensed
  2. // under the Apache License Version 2.0.
  3. // This product includes software developed at Datadog (https://www.datadoghq.com/).
  4. // Copyright 2016-present Datadog, Inc.
  5. // Copyright 2010 The Go Authors. All rights reserved.
  6. // Use of this source code is governed by a BSD-style
  7. // license that can be found in the LICENSE file.
  8. //
  9. // The code that follows is copied from go/src/encoding/json/scanner.go
  10. // It may contain minor edits, such as allowing multiple JSON objects within
  11. // the same input string (see stateEndTop)
  12. //
  13. package obfuscate
  14. import "strconv"
  // SyntaxError describes a problem found while scanning JSON text.
  type SyntaxError struct {
  	// msg is the human-readable description returned by Error.
  	msg string
  	// Offset is the number of bytes that had been read when the error occurred.
  	Offset int64
  }

  // Error implements the error interface by returning the stored description.
  func (e *SyntaxError) Error() string {
  	return e.msg
  }
  // scanner is a byte-at-a-time JSON scanning state machine.
  //
  // Usage: call reset, then feed each input byte c through s.step(s, c).
  // Every call returns an opcode (one of the scan* constants) that reports
  // significant parsing events such as the beginning and end of literals,
  // objects and arrays, so that a caller can follow the structure if it
  // wishes.
  //
  // The opcode scanEnd means that a top-level JSON value was completed
  // *before* the byte that was just passed in. The signal is delayed because
  // the end of a number can only be recognised by looking at the following
  // byte: is 123 a whole value or the beginning of 12345e+6?
  type scanner struct {
  	// step is the transition function to call with the next byte. Storing
  	// the function directly, rather than an integer state dispatched through
  	// a switch, was measured upstream to be about 10% faster and is easier
  	// to read.
  	step func(*scanner, byte) int

  	// endTop is true once the end of a top-level value has been reached.
  	endTop bool

  	// parseState is the stack of composite values currently being scanned
  	// (array values, object keys, object values).
  	parseState []int

  	// err is the error that stopped the scan, if any.
  	err error

  	// redo is a 1-byte redo flag; it is cleared by reset and popParseState.
  	redo bool

  	// bytes is the total number of bytes consumed, updated by
  	// decoder.Decode; it is recorded as the Offset of any SyntaxError.
  	bytes int64
  }
  // These values are the opcodes returned by the state transition functions
  // assigned to scanner.step and by the method scanner.eof.
  // They give details about the current state of the scan that
  // callers might be interested to know about.
  // It is okay to ignore the return value of any particular
  // call to scanner.step: if one call returns scanError,
  // every subsequent call will return scanError too.
  //
  // The numeric order matters: every "continue" opcode sorts before
  // scanEnd and every "stop" opcode sorts at or after it.
  const (
  	// Continue.
  	scanContinue     = iota // uninteresting byte
  	scanBeginLiteral        // end implied by next result != scanContinue
  	scanBeginObject         // begin object
  	scanObjectKey           // just finished object key (string)
  	scanObjectValue         // just finished non-last object value
  	scanEndObject           // end object (implies scanObjectValue if possible)
  	scanBeginArray          // begin array
  	scanArrayValue          // just finished array value
  	scanEndArray            // end array (implies scanArrayValue if possible)
  	scanSkipSpace           // space byte; can skip; known to be last "continue" result

  	// Stop.
  	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
  	scanError // hit an error, scanner.err.
  )
  // Entries of the scanner.parseState stack.
  // Each entry records which kind of composite value is being scanned at
  // that nesting level; entry 0 is the outermost value and the last entry
  // is the innermost one.
  const (
  	parseObjectKey   = iota // inside an object, reading a key (before the colon)
  	parseObjectValue        // inside an object, reading a value (after the colon)
  	parseArrayValue         // inside an array, reading an element
  )
  82. // reset prepares the scanner for use.
  83. // It must be called before calling s.step.
  84. func (s *scanner) reset() {
  85. s.step = stateBeginValue
  86. s.parseState = s.parseState[0:0]
  87. s.err = nil
  88. s.redo = false
  89. s.endTop = false
  90. }
  91. // eof tells the scanner that the end of input has been reached.
  92. // It returns a scan status just as s.step does.
  93. func (s *scanner) eof() int {
  94. if s.err != nil {
  95. return scanError
  96. }
  97. if s.endTop {
  98. return scanEnd
  99. }
  100. s.step(s, ' ')
  101. if s.endTop {
  102. return scanEnd
  103. }
  104. if s.err == nil {
  105. s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
  106. }
  107. return scanError
  108. }
  109. // pushParseState pushes a new parse state p onto the parse stack.
  110. func (s *scanner) pushParseState(p int) {
  111. s.parseState = append(s.parseState, p)
  112. }
  113. // popParseState pops a parse state (already obtained) off the stack
  114. // and updates s.step accordingly.
  115. func (s *scanner) popParseState() {
  116. n := len(s.parseState) - 1
  117. s.parseState = s.parseState[0:n]
  118. s.redo = false
  119. if n == 0 {
  120. s.step = stateEndTop
  121. s.endTop = true
  122. } else {
  123. s.step = stateEndValue
  124. }
  125. }
  // isSpace reports whether c is one of the four JSON whitespace bytes:
  // space, horizontal tab, carriage return or line feed.
  func isSpace(c byte) bool {
  	switch c {
  	case ' ', '\t', '\r', '\n':
  		return true
  	}
  	return false
  }
  129. // stateBeginValueOrEmpty is the state after reading `[`.
  130. func stateBeginValueOrEmpty(s *scanner, c byte) int {
  131. if c <= ' ' && isSpace(c) {
  132. return scanSkipSpace
  133. }
  134. if c == ']' {
  135. return stateEndValue(s, c)
  136. }
  137. return stateBeginValue(s, c)
  138. }
  139. // stateBeginValue is the state at the beginning of the input.
  140. func stateBeginValue(s *scanner, c byte) int {
  141. if c <= ' ' && isSpace(c) {
  142. return scanSkipSpace
  143. }
  144. switch c {
  145. case '{':
  146. s.step = stateBeginStringOrEmpty
  147. s.pushParseState(parseObjectKey)
  148. return scanBeginObject
  149. case '[':
  150. s.step = stateBeginValueOrEmpty
  151. s.pushParseState(parseArrayValue)
  152. return scanBeginArray
  153. case '"':
  154. s.step = stateInString
  155. return scanBeginLiteral
  156. case '-':
  157. s.step = stateNeg
  158. return scanBeginLiteral
  159. case '0': // beginning of 0.123
  160. s.step = state0
  161. return scanBeginLiteral
  162. case 't': // beginning of true
  163. s.step = stateT
  164. return scanBeginLiteral
  165. case 'f': // beginning of false
  166. s.step = stateF
  167. return scanBeginLiteral
  168. case 'n': // beginning of null
  169. s.step = stateN
  170. return scanBeginLiteral
  171. }
  172. if '1' <= c && c <= '9' { // beginning of 1234.5
  173. s.step = state1
  174. return scanBeginLiteral
  175. }
  176. return s.error(c, "looking for beginning of value")
  177. }
  178. // stateBeginStringOrEmpty is the state after reading `{`.
  179. func stateBeginStringOrEmpty(s *scanner, c byte) int {
  180. if c <= ' ' && isSpace(c) {
  181. return scanSkipSpace
  182. }
  183. if c == '}' {
  184. n := len(s.parseState)
  185. s.parseState[n-1] = parseObjectValue
  186. return stateEndValue(s, c)
  187. }
  188. return stateBeginString(s, c)
  189. }
  190. // stateBeginString is the state after reading `{"key": value,`.
  191. func stateBeginString(s *scanner, c byte) int {
  192. if c <= ' ' && isSpace(c) {
  193. return scanSkipSpace
  194. }
  195. if c == '"' {
  196. s.step = stateInString
  197. return scanBeginLiteral
  198. }
  199. return s.error(c, "looking for beginning of object key string")
  200. }
  201. // stateEndValue is the state after completing a value,
  202. // such as after reading `{}` or `true` or `["x"`.
  203. func stateEndValue(s *scanner, c byte) int {
  204. n := len(s.parseState)
  205. if n == 0 {
  206. // Completed top-level before the current byte.
  207. s.step = stateEndTop
  208. s.endTop = true
  209. return stateEndTop(s, c)
  210. }
  211. if c <= ' ' && isSpace(c) {
  212. s.step = stateEndValue
  213. return scanSkipSpace
  214. }
  215. ps := s.parseState[n-1]
  216. switch ps {
  217. case parseObjectKey:
  218. if c == ':' {
  219. s.parseState[n-1] = parseObjectValue
  220. s.step = stateBeginValue
  221. return scanObjectKey
  222. }
  223. return s.error(c, "after object key")
  224. case parseObjectValue:
  225. if c == ',' {
  226. s.parseState[n-1] = parseObjectKey
  227. s.step = stateBeginString
  228. return scanObjectValue
  229. }
  230. if c == '}' {
  231. s.popParseState()
  232. return scanEndObject
  233. }
  234. return s.error(c, "after object key:value pair")
  235. case parseArrayValue:
  236. if c == ',' {
  237. s.step = stateBeginValue
  238. return scanArrayValue
  239. }
  240. if c == ']' {
  241. s.popParseState()
  242. return scanEndArray
  243. }
  244. return s.error(c, "after array element")
  245. }
  246. return s.error(c, "")
  247. }
  248. // stateEndTop is the state after finishing the top-level value,
  249. // such as after reading `{}` or `[1,2,3]`.
  250. // Only space characters should be seen now.
  251. func stateEndTop(s *scanner, c byte) int {
  252. if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
  253. // The former behaviour has been removed. Now, if anything
  254. // other than whitespace follows, we assume a new JSON string
  255. // might be starting. This allows us to continue obfuscating
  256. // further strings in cases where there are multiple JSON
  257. // objects enumerated sequentially within the same input.
  258. // This is a common case for ElasticSearch response bodies.
  259. s.reset()
  260. return s.step(s, c)
  261. }
  262. return scanEnd
  263. }
  264. // stateInString is the state after reading `"`.
  265. func stateInString(s *scanner, c byte) int {
  266. if c == '"' {
  267. s.step = stateEndValue
  268. return scanContinue
  269. }
  270. if c == '\\' {
  271. s.step = stateInStringEsc
  272. return scanContinue
  273. }
  274. if c < 0x20 {
  275. return s.error(c, "in string literal")
  276. }
  277. return scanContinue
  278. }
  279. // stateInStringEsc is the state after reading `"\` during a quoted string.
  280. func stateInStringEsc(s *scanner, c byte) int {
  281. switch c {
  282. case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
  283. s.step = stateInString
  284. return scanContinue
  285. case 'u':
  286. s.step = stateInStringEscU
  287. return scanContinue
  288. }
  289. return s.error(c, "in string escape code")
  290. }
  291. // stateInStringEscU is the state after reading `"\u` during a quoted string.
  292. func stateInStringEscU(s *scanner, c byte) int {
  293. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  294. s.step = stateInStringEscU1
  295. return scanContinue
  296. }
  297. // numbers
  298. return s.error(c, "in \\u hexadecimal character escape")
  299. }
  300. // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
  301. func stateInStringEscU1(s *scanner, c byte) int {
  302. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  303. s.step = stateInStringEscU12
  304. return scanContinue
  305. }
  306. // numbers
  307. return s.error(c, "in \\u hexadecimal character escape")
  308. }
  309. // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
  310. func stateInStringEscU12(s *scanner, c byte) int {
  311. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  312. s.step = stateInStringEscU123
  313. return scanContinue
  314. }
  315. // numbers
  316. return s.error(c, "in \\u hexadecimal character escape")
  317. }
  318. // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
  319. func stateInStringEscU123(s *scanner, c byte) int {
  320. if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
  321. s.step = stateInString
  322. return scanContinue
  323. }
  324. // numbers
  325. return s.error(c, "in \\u hexadecimal character escape")
  326. }
  327. // stateNeg is the state after reading `-` during a number.
  328. func stateNeg(s *scanner, c byte) int {
  329. if c == '0' {
  330. s.step = state0
  331. return scanContinue
  332. }
  333. if '1' <= c && c <= '9' {
  334. s.step = state1
  335. return scanContinue
  336. }
  337. return s.error(c, "in numeric literal")
  338. }
  339. // state1 is the state after reading a non-zero integer during a number,
  340. // such as after reading `1` or `100` but not `0`.
  341. func state1(s *scanner, c byte) int {
  342. if '0' <= c && c <= '9' {
  343. s.step = state1
  344. return scanContinue
  345. }
  346. return state0(s, c)
  347. }
  348. // state0 is the state after reading `0` during a number.
  349. func state0(s *scanner, c byte) int {
  350. if c == '.' {
  351. s.step = stateDot
  352. return scanContinue
  353. }
  354. if c == 'e' || c == 'E' {
  355. s.step = stateE
  356. return scanContinue
  357. }
  358. return stateEndValue(s, c)
  359. }
  360. // stateDot is the state after reading the integer and decimal point in a number,
  361. // such as after reading `1.`.
  362. func stateDot(s *scanner, c byte) int {
  363. if '0' <= c && c <= '9' {
  364. s.step = stateDot0
  365. return scanContinue
  366. }
  367. return s.error(c, "after decimal point in numeric literal")
  368. }
  369. // stateDot0 is the state after reading the integer, decimal point, and subsequent
  370. // digits of a number, such as after reading `3.14`.
  371. func stateDot0(s *scanner, c byte) int {
  372. if '0' <= c && c <= '9' {
  373. return scanContinue
  374. }
  375. if c == 'e' || c == 'E' {
  376. s.step = stateE
  377. return scanContinue
  378. }
  379. return stateEndValue(s, c)
  380. }
  381. // stateE is the state after reading the mantissa and e in a number,
  382. // such as after reading `314e` or `0.314e`.
  383. func stateE(s *scanner, c byte) int {
  384. if c == '+' || c == '-' {
  385. s.step = stateESign
  386. return scanContinue
  387. }
  388. return stateESign(s, c)
  389. }
  390. // stateESign is the state after reading the mantissa, e, and sign in a number,
  391. // such as after reading `314e-` or `0.314e+`.
  392. func stateESign(s *scanner, c byte) int {
  393. if '0' <= c && c <= '9' {
  394. s.step = stateE0
  395. return scanContinue
  396. }
  397. return s.error(c, "in exponent of numeric literal")
  398. }
  399. // stateE0 is the state after reading the mantissa, e, optional sign,
  400. // and at least one digit of the exponent in a number,
  401. // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
  402. func stateE0(s *scanner, c byte) int {
  403. if '0' <= c && c <= '9' {
  404. return scanContinue
  405. }
  406. return stateEndValue(s, c)
  407. }
  408. // stateT is the state after reading `t`.
  409. func stateT(s *scanner, c byte) int {
  410. if c == 'r' {
  411. s.step = stateTr
  412. return scanContinue
  413. }
  414. return s.error(c, "in literal true (expecting 'r')")
  415. }
  416. // stateTr is the state after reading `tr`.
  417. func stateTr(s *scanner, c byte) int {
  418. if c == 'u' {
  419. s.step = stateTru
  420. return scanContinue
  421. }
  422. return s.error(c, "in literal true (expecting 'u')")
  423. }
  424. // stateTru is the state after reading `tru`.
  425. func stateTru(s *scanner, c byte) int {
  426. if c == 'e' {
  427. s.step = stateEndValue
  428. return scanContinue
  429. }
  430. return s.error(c, "in literal true (expecting 'e')")
  431. }
  432. // stateF is the state after reading `f`.
  433. func stateF(s *scanner, c byte) int {
  434. if c == 'a' {
  435. s.step = stateFa
  436. return scanContinue
  437. }
  438. return s.error(c, "in literal false (expecting 'a')")
  439. }
  440. // stateFa is the state after reading `fa`.
  441. func stateFa(s *scanner, c byte) int {
  442. if c == 'l' {
  443. s.step = stateFal
  444. return scanContinue
  445. }
  446. return s.error(c, "in literal false (expecting 'l')")
  447. }
  448. // stateFal is the state after reading `fal`.
  449. func stateFal(s *scanner, c byte) int {
  450. if c == 's' {
  451. s.step = stateFals
  452. return scanContinue
  453. }
  454. return s.error(c, "in literal false (expecting 's')")
  455. }
  456. // stateFals is the state after reading `fals`.
  457. func stateFals(s *scanner, c byte) int {
  458. if c == 'e' {
  459. s.step = stateEndValue
  460. return scanContinue
  461. }
  462. return s.error(c, "in literal false (expecting 'e')")
  463. }
  464. // stateN is the state after reading `n`.
  465. func stateN(s *scanner, c byte) int {
  466. if c == 'u' {
  467. s.step = stateNu
  468. return scanContinue
  469. }
  470. return s.error(c, "in literal null (expecting 'u')")
  471. }
  472. // stateNu is the state after reading `nu`.
  473. func stateNu(s *scanner, c byte) int {
  474. if c == 'l' {
  475. s.step = stateNul
  476. return scanContinue
  477. }
  478. return s.error(c, "in literal null (expecting 'l')")
  479. }
  480. // stateNul is the state after reading `nul`.
  481. func stateNul(s *scanner, c byte) int {
  482. if c == 'l' {
  483. s.step = stateEndValue
  484. return scanContinue
  485. }
  486. return s.error(c, "in literal null (expecting 'l')")
  487. }
  488. // stateError is the state after reaching a syntax error,
  489. // such as after reading `[1}` or `5.1.2`.
  490. func stateError(s *scanner, c byte) int {
  491. return scanError
  492. }
  493. // error records an error and switches to the error state.
  494. func (s *scanner) error(c byte, context string) int {
  495. s.step = stateError
  496. s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
  497. return scanError
  498. }
  // quoteChar formats c as a single-quoted character literal for use in
  // error messages.
  func quoteChar(c byte) string {
  	// The two quote characters are escaped differently here than inside a
  	// double-quoted string, so they are handled explicitly.
  	switch c {
  	case '\'':
  		return `'\''`
  	case '"':
  		return `'"'`
  	}
  	// Otherwise reuse the escaping of a quoted string and swap its
  	// surrounding double quotes for single quotes.
  	quoted := strconv.Quote(string(c))
  	inner := quoted[1 : len(quoted)-1]
  	return "'" + inner + "'"
  }