xml.go 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056
  1. // Copyright 2009 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package xml implements a simple XML 1.0 parser that
  5. // understands XML name spaces.
  6. package xml
  7. // References:
  8. // Annotated XML spec: https://www.xml.com/axml/testaxml.htm
  9. // XML name spaces: https://www.w3.org/TR/REC-xml-names/
  10. // TODO(rsc):
  11. // Test error handling.
  12. import (
  13. "bufio"
  14. "bytes"
  15. "errors"
  16. "fmt"
  17. "io"
  18. "reflect"
  19. "strconv"
  20. "strings"
  21. "unicode"
  22. "unicode/utf8"
  23. )
  24. // A SyntaxError represents a syntax error in the XML input stream.
  25. type SyntaxError struct {
  26. Msg string
  27. Line int
  28. }
  29. func (e *SyntaxError) Error() string {
  30. return "XML syntax error on line " + strconv.Itoa(e.Line) + ": " + e.Msg
  31. }
  32. // A Name represents an XML name (Local) annotated
  33. // with a name space identifier (Space).
  34. // In tokens returned by Decoder.Token, the Space identifier
  35. // is given as a canonical URL, not the short prefix used
  36. // in the document being parsed.
  37. type Name struct {
  38. Space, Local string
  39. }
  40. // An Attr represents an attribute in an XML element (Name=Value).
  41. type Attr struct {
  42. Name Name
  43. Value string
  44. }
  45. // A Token is an interface holding one of the token types:
  46. // StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
  47. type Token interface{}
  48. // A StartElement represents an XML start element.
  49. type StartElement struct {
  50. Name Name
  51. Attr []Attr
  52. }
  53. // Copy creates a new copy of StartElement.
  54. func (e StartElement) Copy() StartElement {
  55. attrs := make([]Attr, len(e.Attr))
  56. copy(attrs, e.Attr)
  57. e.Attr = attrs
  58. return e
  59. }
  60. // End returns the corresponding XML end element.
  61. func (e StartElement) End() EndElement {
  62. return EndElement{e.Name}
  63. }
  64. // An EndElement represents an XML end element.
  65. type EndElement struct {
  66. Name Name
  67. }
  68. // A CharData represents XML character data (raw text),
  69. // in which XML escape sequences have been replaced by
  70. // the characters they represent.
  71. type CharData []byte
  72. func makeCopy(b []byte) []byte {
  73. b1 := make([]byte, len(b))
  74. copy(b1, b)
  75. return b1
  76. }
  77. // Copy creates a new copy of CharData.
  78. func (c CharData) Copy() CharData { return CharData(makeCopy(c)) }
  79. // A Comment represents an XML comment of the form <!--comment-->.
  80. // The bytes do not include the <!-- and --> comment markers.
  81. type Comment []byte
  82. // Copy creates a new copy of Comment.
  83. func (c Comment) Copy() Comment { return Comment(makeCopy(c)) }
  84. // A ProcInst represents an XML processing instruction of the form <?target inst?>
  85. type ProcInst struct {
  86. Target string
  87. Inst []byte
  88. }
  89. // Copy creates a new copy of ProcInst.
  90. func (p ProcInst) Copy() ProcInst {
  91. p.Inst = makeCopy(p.Inst)
  92. return p
  93. }
  94. // A Directive represents an XML directive of the form <!text>.
  95. // The bytes do not include the <! and > markers.
  96. type Directive []byte
  97. // Copy creates a new copy of Directive.
  98. func (d Directive) Copy() Directive { return Directive(makeCopy(d)) }
  99. // CopyToken returns a copy of a Token.
  100. func CopyToken(t Token) Token {
  101. switch v := t.(type) {
  102. case CharData:
  103. return v.Copy()
  104. case Comment:
  105. return v.Copy()
  106. case Directive:
  107. return v.Copy()
  108. case ProcInst:
  109. return v.Copy()
  110. case StartElement:
  111. return v.Copy()
  112. }
  113. return t
  114. }
  115. // A TokenReader is anything that can decode a stream of XML tokens, including a
  116. // Decoder.
  117. //
  118. // When Token encounters an error or end-of-file condition after successfully
  119. // reading a token, it returns the token. It may return the (non-nil) error from
  120. // the same call or return the error (and a nil token) from a subsequent call.
  121. // An instance of this general case is that a TokenReader returning a non-nil
  122. // token at the end of the token stream may return either io.EOF or a nil error.
  123. // The next Read should return nil, io.EOF.
  124. //
  125. // Implementations of Token are discouraged from returning a nil token with a
  126. // nil error. Callers should treat a return of nil, nil as indicating that
  127. // nothing happened; in particular it does not indicate EOF.
  128. type TokenReader interface {
  129. Token() (Token, error)
  130. }
  131. // A Decoder represents an XML parser reading a particular input stream.
  132. // The parser assumes that its input is encoded in UTF-8.
  133. type Decoder struct {
  134. // Strict defaults to true, enforcing the requirements
  135. // of the XML specification.
  136. // If set to false, the parser allows input containing common
  137. // mistakes:
  138. // * If an element is missing an end tag, the parser invents
  139. // end tags as necessary to keep the return values from Token
  140. // properly balanced.
  141. // * In attribute values and character data, unknown or malformed
  142. // character entities (sequences beginning with &) are left alone.
  143. //
  144. // Setting:
  145. //
  146. // d.Strict = false
  147. // d.AutoClose = xml.HTMLAutoClose
  148. // d.Entity = xml.HTMLEntity
  149. //
  150. // creates a parser that can handle typical HTML.
  151. //
  152. // Strict mode does not enforce the requirements of the XML name spaces TR.
  153. // In particular it does not reject name space tags using undefined prefixes.
  154. // Such tags are recorded with the unknown prefix as the name space URL.
  155. Strict bool
  156. // When Strict == false, AutoClose indicates a set of elements to
  157. // consider closed immediately after they are opened, regardless
  158. // of whether an end element is present.
  159. AutoClose []string
  160. // Entity can be used to map non-standard entity names to string replacements.
  161. // The parser behaves as if these standard mappings are present in the map,
  162. // regardless of the actual map content:
  163. //
  164. // "lt": "<",
  165. // "gt": ">",
  166. // "amp": "&",
  167. // "apos": "'",
  168. // "quot": `"`,
  169. Entity map[string]string
  170. // CharsetReader, if non-nil, defines a function to generate
  171. // charset-conversion readers, converting from the provided
  172. // non-UTF-8 charset into UTF-8. If CharsetReader is nil or
  173. // returns an error, parsing stops with an error. One of the
  174. // CharsetReader's result values must be non-nil.
  175. CharsetReader func(charset string, input io.Reader) (io.Reader, error)
  176. // DefaultSpace sets the default name space used for unadorned tags,
  177. // as if the entire XML stream were wrapped in an element containing
  178. // the attribute xmlns="DefaultSpace".
  179. DefaultSpace string
  180. // TypeFunc is used to map type names to actual types.
  181. TypeFunc func(string) (reflect.Type, bool)
  182. r io.ByteReader
  183. t TokenReader
  184. buf bytes.Buffer
  185. saved *bytes.Buffer
  186. stk *stack
  187. free *stack
  188. needClose bool
  189. toClose Name
  190. nextToken Token
  191. nextByte int
  192. ns map[string]string
  193. err error
  194. line int
  195. offset int64
  196. unmarshalDepth int
  197. }
  198. // NewDecoder creates a new XML parser reading from r.
  199. // If r does not implement io.ByteReader, NewDecoder will
  200. // do its own buffering.
  201. func NewDecoder(r io.Reader) *Decoder {
  202. d := &Decoder{
  203. ns: make(map[string]string),
  204. nextByte: -1,
  205. line: 1,
  206. Strict: true,
  207. }
  208. d.switchToReader(r)
  209. return d
  210. }
  211. // NewTokenDecoder creates a new XML parser using an underlying token stream.
  212. func NewTokenDecoder(t TokenReader) *Decoder {
  213. // Is it already a Decoder?
  214. if d, ok := t.(*Decoder); ok {
  215. return d
  216. }
  217. d := &Decoder{
  218. ns: make(map[string]string),
  219. t: t,
  220. nextByte: -1,
  221. line: 1,
  222. Strict: true,
  223. }
  224. return d
  225. }
  226. // Token returns the next XML token in the input stream.
  227. // At the end of the input stream, Token returns nil, io.EOF.
  228. //
  229. // Slices of bytes in the returned token data refer to the
  230. // parser's internal buffer and remain valid only until the next
  231. // call to Token. To acquire a copy of the bytes, call CopyToken
  232. // or the token's Copy method.
  233. //
  234. // Token expands self-closing elements such as <br/>
  235. // into separate start and end elements returned by successive calls.
  236. //
  237. // Token guarantees that the StartElement and EndElement
  238. // tokens it returns are properly nested and matched:
  239. // if Token encounters an unexpected end element
  240. // or EOF before all expected end elements,
  241. // it will return an error.
  242. //
  243. // Token implements XML name spaces as described by
  244. // https://www.w3.org/TR/REC-xml-names/. Each of the
  245. // Name structures contained in the Token has the Space
  246. // set to the URL identifying its name space when known.
  247. // If Token encounters an unrecognized name space prefix,
  248. // it uses the prefix as the Space rather than report an error.
  249. func (d *Decoder) Token() (Token, error) {
  250. var t Token
  251. var err error
  252. if d.stk != nil && d.stk.kind == stkEOF {
  253. return nil, io.EOF
  254. }
  255. if d.nextToken != nil {
  256. t = d.nextToken
  257. d.nextToken = nil
  258. } else if t, err = d.rawToken(); err != nil {
  259. switch {
  260. case err == io.EOF && d.t != nil:
  261. err = nil
  262. case err == io.EOF && d.stk != nil && d.stk.kind != stkEOF:
  263. err = d.syntaxError("unexpected EOF")
  264. }
  265. return t, err
  266. }
  267. if !d.Strict {
  268. if t1, ok := d.autoClose(t); ok {
  269. d.nextToken = t
  270. t = t1
  271. }
  272. }
  273. switch t1 := t.(type) {
  274. case StartElement:
  275. // In XML name spaces, the translations listed in the
  276. // attributes apply to the element name and
  277. // to the other attribute names, so process
  278. // the translations first.
  279. for _, a := range t1.Attr {
  280. if a.Name.Space == xmlnsPrefix {
  281. v, ok := d.ns[a.Name.Local]
  282. d.pushNs(a.Name.Local, v, ok)
  283. d.ns[a.Name.Local] = a.Value
  284. }
  285. if a.Name.Space == "" && a.Name.Local == xmlnsPrefix {
  286. // Default space for untagged names
  287. v, ok := d.ns[""]
  288. d.pushNs("", v, ok)
  289. d.ns[""] = a.Value
  290. }
  291. }
  292. d.translate(&t1.Name, true)
  293. for i := range t1.Attr {
  294. d.translate(&t1.Attr[i].Name, false)
  295. }
  296. d.pushElement(t1.Name)
  297. t = t1
  298. case EndElement:
  299. d.translate(&t1.Name, true)
  300. if !d.popElement(&t1) {
  301. return nil, d.err
  302. }
  303. t = t1
  304. }
  305. return t, err
  306. }
  307. const (
  308. xmlURL = "http://www.w3.org/XML/1998/namespace"
  309. xmlnsPrefix = "xmlns"
  310. xmlPrefix = "xml"
  311. )
  312. // Apply name space translation to name n.
  313. // The default name space (for Space=="")
  314. // applies only to element names, not to attribute names.
  315. func (d *Decoder) translate(n *Name, isElementName bool) {
  316. switch {
  317. case n.Space == xmlnsPrefix:
  318. return
  319. case n.Space == "" && !isElementName:
  320. return
  321. case n.Space == xmlPrefix:
  322. n.Space = xmlURL
  323. case n.Space == "" && n.Local == xmlnsPrefix:
  324. return
  325. }
  326. if v, ok := d.ns[n.Space]; ok {
  327. n.Space = v
  328. } else if n.Space == "" {
  329. n.Space = d.DefaultSpace
  330. }
  331. }
  332. func (d *Decoder) switchToReader(r io.Reader) {
  333. // Get efficient byte at a time reader.
  334. // Assume that if reader has its own
  335. // ReadByte, it's efficient enough.
  336. // Otherwise, use bufio.
  337. if rb, ok := r.(io.ByteReader); ok {
  338. d.r = rb
  339. } else {
  340. d.r = bufio.NewReader(r)
  341. }
  342. }
  343. // Parsing state - stack holds old name space translations
  344. // and the current set of open elements. The translations to pop when
  345. // ending a given tag are *below* it on the stack, which is
  346. // more work but forced on us by XML.
  347. type stack struct {
  348. next *stack
  349. kind int
  350. name Name
  351. ok bool
  352. }
  353. const (
  354. stkStart = iota
  355. stkNs
  356. stkEOF
  357. )
  358. func (d *Decoder) push(kind int) *stack {
  359. s := d.free
  360. if s != nil {
  361. d.free = s.next
  362. } else {
  363. s = new(stack)
  364. }
  365. s.next = d.stk
  366. s.kind = kind
  367. d.stk = s
  368. return s
  369. }
  370. func (d *Decoder) pop() *stack {
  371. s := d.stk
  372. if s != nil {
  373. d.stk = s.next
  374. s.next = d.free
  375. d.free = s
  376. }
  377. return s
  378. }
  379. // Record that after the current element is finished
  380. // (that element is already pushed on the stack)
  381. // Token should return EOF until popEOF is called.
  382. func (d *Decoder) pushEOF() {
  383. // Walk down stack to find Start.
  384. // It might not be the top, because there might be stkNs
  385. // entries above it.
  386. start := d.stk
  387. for start.kind != stkStart {
  388. start = start.next
  389. }
  390. // The stkNs entries below a start are associated with that
  391. // element too; skip over them.
  392. for start.next != nil && start.next.kind == stkNs {
  393. start = start.next
  394. }
  395. s := d.free
  396. if s != nil {
  397. d.free = s.next
  398. } else {
  399. s = new(stack)
  400. }
  401. s.kind = stkEOF
  402. s.next = start.next
  403. start.next = s
  404. }
  405. // Undo a pushEOF.
  406. // The element must have been finished, so the EOF should be at the top of the stack.
  407. func (d *Decoder) popEOF() bool {
  408. if d.stk == nil || d.stk.kind != stkEOF {
  409. return false
  410. }
  411. d.pop()
  412. return true
  413. }
  414. // Record that we are starting an element with the given name.
  415. func (d *Decoder) pushElement(name Name) {
  416. s := d.push(stkStart)
  417. s.name = name
  418. }
  419. // Record that we are changing the value of ns[local].
  420. // The old value is url, ok.
  421. func (d *Decoder) pushNs(local string, url string, ok bool) {
  422. s := d.push(stkNs)
  423. s.name.Local = local
  424. s.name.Space = url
  425. s.ok = ok
  426. }
  427. // Creates a SyntaxError with the current line number.
  428. func (d *Decoder) syntaxError(msg string) error {
  429. return &SyntaxError{Msg: msg, Line: d.line}
  430. }
  431. // Record that we are ending an element with the given name.
  432. // The name must match the record at the top of the stack,
  433. // which must be a pushElement record.
  434. // After popping the element, apply any undo records from
  435. // the stack to restore the name translations that existed
  436. // before we saw this element.
  437. func (d *Decoder) popElement(t *EndElement) bool {
  438. s := d.pop()
  439. name := t.Name
  440. switch {
  441. case s == nil || s.kind != stkStart:
  442. d.err = d.syntaxError("unexpected end element </" + name.Local + ">")
  443. return false
  444. case s.name.Local != name.Local:
  445. if !d.Strict {
  446. d.needClose = true
  447. d.toClose = t.Name
  448. t.Name = s.name
  449. return true
  450. }
  451. d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
  452. return false
  453. case s.name.Space != name.Space:
  454. d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
  455. "closed by </" + name.Local + "> in space " + name.Space)
  456. return false
  457. }
  458. // Pop stack until a Start or EOF is on the top, undoing the
  459. // translations that were associated with the element we just closed.
  460. for d.stk != nil && d.stk.kind != stkStart && d.stk.kind != stkEOF {
  461. s := d.pop()
  462. if s.ok {
  463. d.ns[s.name.Local] = s.name.Space
  464. } else {
  465. delete(d.ns, s.name.Local)
  466. }
  467. }
  468. return true
  469. }
  470. // If the top element on the stack is autoclosing and
  471. // t is not the end tag, invent the end tag.
  472. func (d *Decoder) autoClose(t Token) (Token, bool) {
  473. if d.stk == nil || d.stk.kind != stkStart {
  474. return nil, false
  475. }
  476. name := strings.ToLower(d.stk.name.Local)
  477. for _, s := range d.AutoClose {
  478. if strings.ToLower(s) == name {
  479. // This one should be auto closed if t doesn't close it.
  480. et, ok := t.(EndElement)
  481. if !ok || et.Name.Local != name {
  482. return EndElement{d.stk.name}, true
  483. }
  484. break
  485. }
  486. }
  487. return nil, false
  488. }
  489. var errRawToken = errors.New("xml: cannot use RawToken from UnmarshalXML method")
  490. // RawToken is like Token but does not verify that
  491. // start and end elements match and does not translate
  492. // name space prefixes to their corresponding URLs.
  493. func (d *Decoder) RawToken() (Token, error) {
  494. if d.unmarshalDepth > 0 {
  495. return nil, errRawToken
  496. }
  497. return d.rawToken()
  498. }
  499. func (d *Decoder) rawToken() (Token, error) {
  500. if d.t != nil {
  501. return d.t.Token()
  502. }
  503. if d.err != nil {
  504. return nil, d.err
  505. }
  506. if d.needClose {
  507. // The last element we read was self-closing and
  508. // we returned just the StartElement half.
  509. // Return the EndElement half now.
  510. d.needClose = false
  511. return EndElement{d.toClose}, nil
  512. }
  513. b, ok := d.getc()
  514. if !ok {
  515. return nil, d.err
  516. }
  517. if b != '<' {
  518. // Text section.
  519. d.ungetc(b)
  520. data := d.text(-1, false)
  521. if data == nil {
  522. return nil, d.err
  523. }
  524. return CharData(data), nil
  525. }
  526. if b, ok = d.mustgetc(); !ok {
  527. return nil, d.err
  528. }
  529. switch b {
  530. case '/':
  531. // </: End element
  532. var name Name
  533. if name, ok = d.nsname(); !ok {
  534. if d.err == nil {
  535. d.err = d.syntaxError("expected element name after </")
  536. }
  537. return nil, d.err
  538. }
  539. d.space()
  540. if b, ok = d.mustgetc(); !ok {
  541. return nil, d.err
  542. }
  543. if b != '>' {
  544. d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
  545. return nil, d.err
  546. }
  547. return EndElement{name}, nil
  548. case '?':
  549. // <?: Processing instruction.
  550. var target string
  551. if target, ok = d.name(); !ok {
  552. if d.err == nil {
  553. d.err = d.syntaxError("expected target name after <?")
  554. }
  555. return nil, d.err
  556. }
  557. d.space()
  558. d.buf.Reset()
  559. var b0 byte
  560. for {
  561. if b, ok = d.mustgetc(); !ok {
  562. return nil, d.err
  563. }
  564. d.buf.WriteByte(b)
  565. if b0 == '?' && b == '>' {
  566. break
  567. }
  568. b0 = b
  569. }
  570. data := d.buf.Bytes()
  571. data = data[0 : len(data)-2] // chop ?>
  572. if target == "xml" {
  573. content := string(data)
  574. ver := procInst("version", content)
  575. if ver != "" && ver != "1.0" {
  576. d.err = fmt.Errorf("xml: unsupported version %q; only version 1.0 is supported", ver)
  577. return nil, d.err
  578. }
  579. enc := procInst("encoding", content)
  580. if enc != "" && enc != "utf-8" && enc != "UTF-8" && !strings.EqualFold(enc, "utf-8") {
  581. if d.CharsetReader == nil {
  582. d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
  583. return nil, d.err
  584. }
  585. newr, err := d.CharsetReader(enc, d.r.(io.Reader))
  586. if err != nil {
  587. d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
  588. return nil, d.err
  589. }
  590. if newr == nil {
  591. panic("CharsetReader returned a nil Reader for charset " + enc)
  592. }
  593. d.switchToReader(newr)
  594. }
  595. }
  596. return ProcInst{target, data}, nil
  597. case '!':
  598. // <!: Maybe comment, maybe CDATA.
  599. if b, ok = d.mustgetc(); !ok {
  600. return nil, d.err
  601. }
  602. switch b {
  603. case '-': // <!-
  604. // Probably <!-- for a comment.
  605. if b, ok = d.mustgetc(); !ok {
  606. return nil, d.err
  607. }
  608. if b != '-' {
  609. d.err = d.syntaxError("invalid sequence <!- not part of <!--")
  610. return nil, d.err
  611. }
  612. // Look for terminator.
  613. d.buf.Reset()
  614. var b0, b1 byte
  615. for {
  616. if b, ok = d.mustgetc(); !ok {
  617. return nil, d.err
  618. }
  619. d.buf.WriteByte(b)
  620. if b0 == '-' && b1 == '-' {
  621. if b != '>' {
  622. d.err = d.syntaxError(
  623. `invalid sequence "--" not allowed in comments`)
  624. return nil, d.err
  625. }
  626. break
  627. }
  628. b0, b1 = b1, b
  629. }
  630. data := d.buf.Bytes()
  631. data = data[0 : len(data)-3] // chop -->
  632. return Comment(data), nil
  633. case '[': // <![
  634. // Probably <![CDATA[.
  635. for i := 0; i < 6; i++ {
  636. if b, ok = d.mustgetc(); !ok {
  637. return nil, d.err
  638. }
  639. if b != "CDATA["[i] {
  640. d.err = d.syntaxError("invalid <![ sequence")
  641. return nil, d.err
  642. }
  643. }
  644. // Have <![CDATA[. Read text until ]]>.
  645. data := d.text(-1, true)
  646. if data == nil {
  647. return nil, d.err
  648. }
  649. return CharData(data), nil
  650. }
  651. // Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
  652. // We don't care, but accumulate for caller. Quoted angle
  653. // brackets do not count for nesting.
  654. d.buf.Reset()
  655. d.buf.WriteByte(b)
  656. inquote := uint8(0)
  657. depth := 0
  658. for {
  659. if b, ok = d.mustgetc(); !ok {
  660. return nil, d.err
  661. }
  662. if inquote == 0 && b == '>' && depth == 0 {
  663. break
  664. }
  665. HandleB:
  666. d.buf.WriteByte(b)
  667. switch {
  668. case b == inquote:
  669. inquote = 0
  670. case inquote != 0:
  671. // in quotes, no special action
  672. case b == '\'' || b == '"':
  673. inquote = b
  674. case b == '>' && inquote == 0:
  675. depth--
  676. case b == '<' && inquote == 0:
  677. // Look for <!-- to begin comment.
  678. s := "!--"
  679. for i := 0; i < len(s); i++ {
  680. if b, ok = d.mustgetc(); !ok {
  681. return nil, d.err
  682. }
  683. if b != s[i] {
  684. for j := 0; j < i; j++ {
  685. d.buf.WriteByte(s[j])
  686. }
  687. depth++
  688. goto HandleB
  689. }
  690. }
  691. // Remove < that was written above.
  692. d.buf.Truncate(d.buf.Len() - 1)
  693. // Look for terminator.
  694. var b0, b1 byte
  695. for {
  696. if b, ok = d.mustgetc(); !ok {
  697. return nil, d.err
  698. }
  699. if b0 == '-' && b1 == '-' && b == '>' {
  700. break
  701. }
  702. b0, b1 = b1, b
  703. }
  704. }
  705. }
  706. return Directive(d.buf.Bytes()), nil
  707. }
  708. // Must be an open element like <a href="foo">
  709. d.ungetc(b)
  710. var (
  711. name Name
  712. empty bool
  713. attr []Attr
  714. )
  715. if name, ok = d.nsname(); !ok {
  716. if d.err == nil {
  717. d.err = d.syntaxError("expected element name after <")
  718. }
  719. return nil, d.err
  720. }
  721. attr = []Attr{}
  722. for {
  723. d.space()
  724. if b, ok = d.mustgetc(); !ok {
  725. return nil, d.err
  726. }
  727. if b == '/' {
  728. empty = true
  729. if b, ok = d.mustgetc(); !ok {
  730. return nil, d.err
  731. }
  732. if b != '>' {
  733. d.err = d.syntaxError("expected /> in element")
  734. return nil, d.err
  735. }
  736. break
  737. }
  738. if b == '>' {
  739. break
  740. }
  741. d.ungetc(b)
  742. a := Attr{}
  743. if a.Name, ok = d.nsname(); !ok {
  744. if d.err == nil {
  745. d.err = d.syntaxError("expected attribute name in element")
  746. }
  747. return nil, d.err
  748. }
  749. d.space()
  750. if b, ok = d.mustgetc(); !ok {
  751. return nil, d.err
  752. }
  753. if b != '=' {
  754. if d.Strict {
  755. d.err = d.syntaxError("attribute name without = in element")
  756. return nil, d.err
  757. }
  758. d.ungetc(b)
  759. a.Value = a.Name.Local
  760. } else {
  761. d.space()
  762. data := d.attrval()
  763. if data == nil {
  764. return nil, d.err
  765. }
  766. a.Value = string(data)
  767. }
  768. attr = append(attr, a)
  769. }
  770. if empty {
  771. d.needClose = true
  772. d.toClose = name
  773. }
  774. return StartElement{name, attr}, nil
  775. }
  776. func (d *Decoder) attrval() []byte {
  777. b, ok := d.mustgetc()
  778. if !ok {
  779. return nil
  780. }
  781. // Handle quoted attribute values
  782. if b == '"' || b == '\'' {
  783. return d.text(int(b), false)
  784. }
  785. // Handle unquoted attribute values for strict parsers
  786. if d.Strict {
  787. d.err = d.syntaxError("unquoted or missing attribute value in element")
  788. return nil
  789. }
  790. // Handle unquoted attribute values for unstrict parsers
  791. d.ungetc(b)
  792. d.buf.Reset()
  793. for {
  794. b, ok = d.mustgetc()
  795. if !ok {
  796. return nil
  797. }
  798. // https://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
  799. if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
  800. '0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
  801. d.buf.WriteByte(b)
  802. } else {
  803. d.ungetc(b)
  804. break
  805. }
  806. }
  807. return d.buf.Bytes()
  808. }
  809. // Skip spaces if any
  810. func (d *Decoder) space() {
  811. for {
  812. b, ok := d.getc()
  813. if !ok {
  814. return
  815. }
  816. switch b {
  817. case ' ', '\r', '\n', '\t':
  818. default:
  819. d.ungetc(b)
  820. return
  821. }
  822. }
  823. }
  824. // Read a single byte.
  825. // If there is no byte to read, return ok==false
  826. // and leave the error in d.err.
  827. // Maintain line number.
  828. func (d *Decoder) getc() (b byte, ok bool) {
  829. if d.err != nil {
  830. return 0, false
  831. }
  832. if d.nextByte >= 0 {
  833. b = byte(d.nextByte)
  834. d.nextByte = -1
  835. } else {
  836. b, d.err = d.r.ReadByte()
  837. if d.err != nil {
  838. return 0, false
  839. }
  840. if d.saved != nil {
  841. d.saved.WriteByte(b)
  842. }
  843. }
  844. if b == '\n' {
  845. d.line++
  846. }
  847. d.offset++
  848. return b, true
  849. }
// InputOffset returns the input stream byte offset of the current decoder position.
// The offset gives the location of the end of the most recently returned token
// and the beginning of the next token.
//
// The value is d.offset, which getc increments for every byte it hands out
// and ungetc decrements for every byte pushed back.
func (d *Decoder) InputOffset() int64 {
	return d.offset
}
  856. // Return saved offset.
  857. // If we did ungetc (nextByte >= 0), have to back up one.
  858. func (d *Decoder) savedOffset() int {
  859. n := d.saved.Len()
  860. if d.nextByte >= 0 {
  861. n--
  862. }
  863. return n
  864. }
  865. // Must read a single byte.
  866. // If there is no byte to read,
  867. // set d.err to SyntaxError("unexpected EOF")
  868. // and return ok==false
  869. func (d *Decoder) mustgetc() (b byte, ok bool) {
  870. if b, ok = d.getc(); !ok {
  871. if d.err == io.EOF {
  872. d.err = d.syntaxError("unexpected EOF")
  873. }
  874. }
  875. return
  876. }
  877. // Unread a single byte.
  878. func (d *Decoder) ungetc(b byte) {
  879. if b == '\n' {
  880. d.line--
  881. }
  882. d.nextByte = int(b)
  883. d.offset--
  884. }
  885. var entity = map[string]rune{
  886. "lt": '<',
  887. "gt": '>',
  888. "amp": '&',
  889. "apos": '\'',
  890. "quot": '"',
  891. }
// Read plain text section (XML calls it character data) into d.buf.
// If quote >= 0, we are in a quoted string and need to find the matching quote;
// the closing quote is consumed but not included in the result.
// If cdata == true, we are in a <![CDATA[ section and need to find ]]>,
// which is consumed and stripped from the result.
// Otherwise reading stops just before the next '<' (which is pushed back),
// or when getc has no more bytes to give.
//
// Outside CDATA, character references (&#NN; / &#xHH;) and entity
// references (&name;) are expanded. In all modes \r and \r\n are rewritten to \n.
// The result is validated as UTF-8 within the XML character range.
// The returned slice is the one handed out by d.buf.Bytes(), not a copy.
//
// On failure return nil and leave the error in d.err.
func (d *Decoder) text(quote int, cdata bool) []byte {
	// b0 and b1 are the two bytes read before the current one; they are
	// used to spot "]]>" and "\r\n" across iterations.
	var b0, b1 byte
	// trunc is the number of trailing bytes to drop from the result.
	var trunc int
	d.buf.Reset()
Input:
	for {
		b, ok := d.getc()
		if !ok {
			if cdata {
				// A CDATA section must be terminated explicitly.
				if d.err == io.EOF {
					d.err = d.syntaxError("unexpected EOF in CDATA section")
				}
				return nil
			}
			// Outside CDATA, running out of input simply ends the text;
			// d.err is left as getc set it.
			break Input
		}

		// <![CDATA[ section ends with ]]>.
		// It is an error for ]]> to appear in ordinary text.
		if b0 == ']' && b1 == ']' && b == '>' {
			if cdata {
				// The two ']' bytes are already in d.buf; drop them below.
				trunc = 2
				break Input
			}
			d.err = d.syntaxError("unescaped ]]> not in CDATA section")
			return nil
		}

		// Stop reading text if we see a <.
		if b == '<' && !cdata {
			if quote >= 0 {
				d.err = d.syntaxError("unescaped < inside quoted string")
				return nil
			}
			// Leave the '<' for the caller to start the next token with.
			d.ungetc('<')
			break Input
		}
		// The matching close quote ends a quoted string.
		if quote >= 0 && b == byte(quote) {
			break Input
		}
		if b == '&' && !cdata {
			// Read escaped character expression up to semicolon.
			// XML in all its glory allows a document to define and use
			// its own character names with <!ENTITY ...> directives.
			// Parsers are required to recognize lt, gt, amp, apos, and quot
			// even if they have not been declared.
			//
			// The raw reference is appended to d.buf as it is read; before
			// marks where it starts so it can be replaced by its expansion.
			before := d.buf.Len()
			d.buf.WriteByte('&')
			var ok bool
			var text string
			var haveText bool
			if b, ok = d.mustgetc(); !ok {
				return nil
			}
			if b == '#' {
				// Numeric character reference: decimal, or hex after 'x'.
				d.buf.WriteByte(b)
				if b, ok = d.mustgetc(); !ok {
					return nil
				}
				base := 10
				if b == 'x' {
					base = 16
					d.buf.WriteByte(b)
					if b, ok = d.mustgetc(); !ok {
						return nil
					}
				}
				// start marks the first digit within d.buf.
				start := d.buf.Len()
				for '0' <= b && b <= '9' ||
					base == 16 && 'a' <= b && b <= 'f' ||
					base == 16 && 'A' <= b && b <= 'F' {
					d.buf.WriteByte(b)
					if b, ok = d.mustgetc(); !ok {
						return nil
					}
				}
				if b != ';' {
					// Not a complete reference; give the byte back.
					d.ungetc(b)
				} else {
					s := string(d.buf.Bytes()[start:])
					d.buf.WriteByte(';')
					// An empty or oversized digit string fails to parse and
					// leaves haveText false.
					n, err := strconv.ParseUint(s, base, 64)
					if err == nil && n <= unicode.MaxRune {
						text = string(rune(n))
						haveText = true
					}
				}
			} else {
				// Named entity reference.
				d.ungetc(b)
				if !d.readName() {
					// A missing name is tolerated here (handled below);
					// a real read error is not.
					if d.err != nil {
						return nil
					}
				}
				if b, ok = d.mustgetc(); !ok {
					return nil
				}
				if b != ';' {
					d.ungetc(b)
				} else {
					// The name sits between the '&' and the ';' in d.buf.
					name := d.buf.Bytes()[before+1:]
					d.buf.WriteByte(';')
					if isName(name) {
						s := string(name)
						// Predefined entities win over the user-supplied map.
						if r, ok := entity[s]; ok {
							text = string(r)
							haveText = true
						} else if d.Entity != nil {
							text, haveText = d.Entity[s]
						}
					}
				}
			}

			if haveText {
				// Replace the raw reference with its expansion.
				d.buf.Truncate(before)
				d.buf.Write([]byte(text))
				// Reset the history so expanded text cannot combine with
				// following bytes into "]]>" or "\r\n".
				b0, b1 = 0, 0
				continue Input
			}
			if !d.Strict {
				// Non-strict mode keeps the unrecognized reference verbatim.
				b0, b1 = 0, 0
				continue Input
			}
			ent := string(d.buf.Bytes()[before:])
			if ent[len(ent)-1] != ';' {
				ent += " (no semicolon)"
			}
			d.err = d.syntaxError("invalid character entity " + ent)
			return nil
		}

		// We must rewrite unescaped \r and \r\n into \n.
		if b == '\r' {
			d.buf.WriteByte('\n')
		} else if b1 == '\r' && b == '\n' {
			// Skip \r\n--we already wrote \n.
		} else {
			d.buf.WriteByte(b)
		}

		b0, b1 = b1, b
	}
	data := d.buf.Bytes()
	// Drop the "]]" of a CDATA terminator, if any.
	data = data[0 : len(data)-trunc]

	// Inspect each rune for being a disallowed character.
	buf := data
	for len(buf) > 0 {
		r, size := utf8.DecodeRune(buf)
		// RuneError with size 1 means an invalid encoding, as opposed to a
		// literal U+FFFD in the input.
		if r == utf8.RuneError && size == 1 {
			d.err = d.syntaxError("invalid UTF-8")
			return nil
		}
		buf = buf[size:]
		if !isInCharacterRange(r) {
			d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r))
			return nil
		}
	}

	return data
}
  1052. // Decide whether the given rune is in the XML Character Range, per
  1053. // the Char production of https://www.xml.com/axml/testaxml.htm,
  1054. // Section 2.2 Characters.
  1055. func isInCharacterRange(r rune) (inrange bool) {
  1056. return r == 0x09 ||
  1057. r == 0x0A ||
  1058. r == 0x0D ||
  1059. r >= 0x20 && r <= 0xD7FF ||
  1060. r >= 0xE000 && r <= 0xFFFD ||
  1061. r >= 0x10000 && r <= 0x10FFFF
  1062. }
  1063. // Get name space name: name with a : stuck in the middle.
  1064. // The part before the : is the name space identifier.
  1065. func (d *Decoder) nsname() (name Name, ok bool) {
  1066. s, ok := d.name()
  1067. if !ok {
  1068. return
  1069. }
  1070. i := strings.Index(s, ":")
  1071. if i < 0 {
  1072. name.Local = s
  1073. } else {
  1074. name.Space = s[0:i]
  1075. name.Local = s[i+1:]
  1076. }
  1077. return name, true
  1078. }
  1079. // Get name: /first(first|second)*/
  1080. // Do not set d.err if the name is missing (unless unexpected EOF is received):
  1081. // let the caller provide better context.
  1082. func (d *Decoder) name() (s string, ok bool) {
  1083. d.buf.Reset()
  1084. if !d.readName() {
  1085. return "", false
  1086. }
  1087. // Now we check the characters.
  1088. b := d.buf.Bytes()
  1089. if !isName(b) {
  1090. d.err = d.syntaxError("invalid XML name: " + string(b))
  1091. return "", false
  1092. }
  1093. return string(b), true
  1094. }
  1095. // Read a name and append its bytes to d.buf.
  1096. // The name is delimited by any single-byte character not valid in names.
  1097. // All multi-byte characters are accepted; the caller must check their validity.
  1098. func (d *Decoder) readName() (ok bool) {
  1099. var b byte
  1100. if b, ok = d.mustgetc(); !ok {
  1101. return
  1102. }
  1103. if b < utf8.RuneSelf && !isNameByte(b) {
  1104. d.ungetc(b)
  1105. return false
  1106. }
  1107. d.buf.WriteByte(b)
  1108. for {
  1109. if b, ok = d.mustgetc(); !ok {
  1110. return
  1111. }
  1112. if b < utf8.RuneSelf && !isNameByte(b) {
  1113. d.ungetc(b)
  1114. break
  1115. }
  1116. d.buf.WriteByte(b)
  1117. }
  1118. return true
  1119. }
  1120. func isNameByte(c byte) bool {
  1121. return 'A' <= c && c <= 'Z' ||
  1122. 'a' <= c && c <= 'z' ||
  1123. '0' <= c && c <= '9' ||
  1124. c == '_' || c == ':' || c == '.' || c == '-'
  1125. }
  1126. func isName(s []byte) bool {
  1127. if len(s) == 0 {
  1128. return false
  1129. }
  1130. c, n := utf8.DecodeRune(s)
  1131. if c == utf8.RuneError && n == 1 {
  1132. return false
  1133. }
  1134. if !unicode.Is(first, c) {
  1135. return false
  1136. }
  1137. for n < len(s) {
  1138. s = s[n:]
  1139. c, n = utf8.DecodeRune(s)
  1140. if c == utf8.RuneError && n == 1 {
  1141. return false
  1142. }
  1143. if !unicode.Is(first, c) && !unicode.Is(second, c) {
  1144. return false
  1145. }
  1146. }
  1147. return true
  1148. }
  1149. func isNameString(s string) bool {
  1150. if len(s) == 0 {
  1151. return false
  1152. }
  1153. c, n := utf8.DecodeRuneInString(s)
  1154. if c == utf8.RuneError && n == 1 {
  1155. return false
  1156. }
  1157. if !unicode.Is(first, c) {
  1158. return false
  1159. }
  1160. for n < len(s) {
  1161. s = s[n:]
  1162. c, n = utf8.DecodeRuneInString(s)
  1163. if c == utf8.RuneError && n == 1 {
  1164. return false
  1165. }
  1166. if !unicode.Is(first, c) && !unicode.Is(second, c) {
  1167. return false
  1168. }
  1169. }
  1170. return true
  1171. }
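// These tables were generated by cut and paste from Appendix B of
// the XML spec at https://www.xml.com/axml/testaxml.htm
// and then reformatting. The table first holds the characters that may
// start a name (Letter | '_' | ':'). The table second holds the remaining
// NameChar characters (digits, '.', '-', combining characters, extenders),
// which may appear anywhere in a name except as its first character.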
  1176. var first = &unicode.RangeTable{
  1177. R16: []unicode.Range16{
  1178. {0x003A, 0x003A, 1},
  1179. {0x0041, 0x005A, 1},
  1180. {0x005F, 0x005F, 1},
  1181. {0x0061, 0x007A, 1},
  1182. {0x00C0, 0x00D6, 1},
  1183. {0x00D8, 0x00F6, 1},
  1184. {0x00F8, 0x00FF, 1},
  1185. {0x0100, 0x0131, 1},
  1186. {0x0134, 0x013E, 1},
  1187. {0x0141, 0x0148, 1},
  1188. {0x014A, 0x017E, 1},
  1189. {0x0180, 0x01C3, 1},
  1190. {0x01CD, 0x01F0, 1},
  1191. {0x01F4, 0x01F5, 1},
  1192. {0x01FA, 0x0217, 1},
  1193. {0x0250, 0x02A8, 1},
  1194. {0x02BB, 0x02C1, 1},
  1195. {0x0386, 0x0386, 1},
  1196. {0x0388, 0x038A, 1},
  1197. {0x038C, 0x038C, 1},
  1198. {0x038E, 0x03A1, 1},
  1199. {0x03A3, 0x03CE, 1},
  1200. {0x03D0, 0x03D6, 1},
  1201. {0x03DA, 0x03E0, 2},
  1202. {0x03E2, 0x03F3, 1},
  1203. {0x0401, 0x040C, 1},
  1204. {0x040E, 0x044F, 1},
  1205. {0x0451, 0x045C, 1},
  1206. {0x045E, 0x0481, 1},
  1207. {0x0490, 0x04C4, 1},
  1208. {0x04C7, 0x04C8, 1},
  1209. {0x04CB, 0x04CC, 1},
  1210. {0x04D0, 0x04EB, 1},
  1211. {0x04EE, 0x04F5, 1},
  1212. {0x04F8, 0x04F9, 1},
  1213. {0x0531, 0x0556, 1},
  1214. {0x0559, 0x0559, 1},
  1215. {0x0561, 0x0586, 1},
  1216. {0x05D0, 0x05EA, 1},
  1217. {0x05F0, 0x05F2, 1},
  1218. {0x0621, 0x063A, 1},
  1219. {0x0641, 0x064A, 1},
  1220. {0x0671, 0x06B7, 1},
  1221. {0x06BA, 0x06BE, 1},
  1222. {0x06C0, 0x06CE, 1},
  1223. {0x06D0, 0x06D3, 1},
  1224. {0x06D5, 0x06D5, 1},
  1225. {0x06E5, 0x06E6, 1},
  1226. {0x0905, 0x0939, 1},
  1227. {0x093D, 0x093D, 1},
  1228. {0x0958, 0x0961, 1},
  1229. {0x0985, 0x098C, 1},
  1230. {0x098F, 0x0990, 1},
  1231. {0x0993, 0x09A8, 1},
  1232. {0x09AA, 0x09B0, 1},
  1233. {0x09B2, 0x09B2, 1},
  1234. {0x09B6, 0x09B9, 1},
  1235. {0x09DC, 0x09DD, 1},
  1236. {0x09DF, 0x09E1, 1},
  1237. {0x09F0, 0x09F1, 1},
  1238. {0x0A05, 0x0A0A, 1},
  1239. {0x0A0F, 0x0A10, 1},
  1240. {0x0A13, 0x0A28, 1},
  1241. {0x0A2A, 0x0A30, 1},
  1242. {0x0A32, 0x0A33, 1},
  1243. {0x0A35, 0x0A36, 1},
  1244. {0x0A38, 0x0A39, 1},
  1245. {0x0A59, 0x0A5C, 1},
  1246. {0x0A5E, 0x0A5E, 1},
  1247. {0x0A72, 0x0A74, 1},
  1248. {0x0A85, 0x0A8B, 1},
  1249. {0x0A8D, 0x0A8D, 1},
  1250. {0x0A8F, 0x0A91, 1},
  1251. {0x0A93, 0x0AA8, 1},
  1252. {0x0AAA, 0x0AB0, 1},
  1253. {0x0AB2, 0x0AB3, 1},
  1254. {0x0AB5, 0x0AB9, 1},
  1255. {0x0ABD, 0x0AE0, 0x23},
  1256. {0x0B05, 0x0B0C, 1},
  1257. {0x0B0F, 0x0B10, 1},
  1258. {0x0B13, 0x0B28, 1},
  1259. {0x0B2A, 0x0B30, 1},
  1260. {0x0B32, 0x0B33, 1},
  1261. {0x0B36, 0x0B39, 1},
  1262. {0x0B3D, 0x0B3D, 1},
  1263. {0x0B5C, 0x0B5D, 1},
  1264. {0x0B5F, 0x0B61, 1},
  1265. {0x0B85, 0x0B8A, 1},
  1266. {0x0B8E, 0x0B90, 1},
  1267. {0x0B92, 0x0B95, 1},
  1268. {0x0B99, 0x0B9A, 1},
  1269. {0x0B9C, 0x0B9C, 1},
  1270. {0x0B9E, 0x0B9F, 1},
  1271. {0x0BA3, 0x0BA4, 1},
  1272. {0x0BA8, 0x0BAA, 1},
  1273. {0x0BAE, 0x0BB5, 1},
  1274. {0x0BB7, 0x0BB9, 1},
  1275. {0x0C05, 0x0C0C, 1},
  1276. {0x0C0E, 0x0C10, 1},
  1277. {0x0C12, 0x0C28, 1},
  1278. {0x0C2A, 0x0C33, 1},
  1279. {0x0C35, 0x0C39, 1},
  1280. {0x0C60, 0x0C61, 1},
  1281. {0x0C85, 0x0C8C, 1},
  1282. {0x0C8E, 0x0C90, 1},
  1283. {0x0C92, 0x0CA8, 1},
  1284. {0x0CAA, 0x0CB3, 1},
  1285. {0x0CB5, 0x0CB9, 1},
  1286. {0x0CDE, 0x0CDE, 1},
  1287. {0x0CE0, 0x0CE1, 1},
  1288. {0x0D05, 0x0D0C, 1},
  1289. {0x0D0E, 0x0D10, 1},
  1290. {0x0D12, 0x0D28, 1},
  1291. {0x0D2A, 0x0D39, 1},
  1292. {0x0D60, 0x0D61, 1},
  1293. {0x0E01, 0x0E2E, 1},
  1294. {0x0E30, 0x0E30, 1},
  1295. {0x0E32, 0x0E33, 1},
  1296. {0x0E40, 0x0E45, 1},
  1297. {0x0E81, 0x0E82, 1},
  1298. {0x0E84, 0x0E84, 1},
  1299. {0x0E87, 0x0E88, 1},
  1300. {0x0E8A, 0x0E8D, 3},
  1301. {0x0E94, 0x0E97, 1},
  1302. {0x0E99, 0x0E9F, 1},
  1303. {0x0EA1, 0x0EA3, 1},
  1304. {0x0EA5, 0x0EA7, 2},
  1305. {0x0EAA, 0x0EAB, 1},
  1306. {0x0EAD, 0x0EAE, 1},
  1307. {0x0EB0, 0x0EB0, 1},
  1308. {0x0EB2, 0x0EB3, 1},
  1309. {0x0EBD, 0x0EBD, 1},
  1310. {0x0EC0, 0x0EC4, 1},
  1311. {0x0F40, 0x0F47, 1},
  1312. {0x0F49, 0x0F69, 1},
  1313. {0x10A0, 0x10C5, 1},
  1314. {0x10D0, 0x10F6, 1},
  1315. {0x1100, 0x1100, 1},
  1316. {0x1102, 0x1103, 1},
  1317. {0x1105, 0x1107, 1},
  1318. {0x1109, 0x1109, 1},
  1319. {0x110B, 0x110C, 1},
  1320. {0x110E, 0x1112, 1},
  1321. {0x113C, 0x1140, 2},
  1322. {0x114C, 0x1150, 2},
  1323. {0x1154, 0x1155, 1},
  1324. {0x1159, 0x1159, 1},
  1325. {0x115F, 0x1161, 1},
  1326. {0x1163, 0x1169, 2},
  1327. {0x116D, 0x116E, 1},
  1328. {0x1172, 0x1173, 1},
  1329. {0x1175, 0x119E, 0x119E - 0x1175},
  1330. {0x11A8, 0x11AB, 0x11AB - 0x11A8},
  1331. {0x11AE, 0x11AF, 1},
  1332. {0x11B7, 0x11B8, 1},
  1333. {0x11BA, 0x11BA, 1},
  1334. {0x11BC, 0x11C2, 1},
  1335. {0x11EB, 0x11F0, 0x11F0 - 0x11EB},
  1336. {0x11F9, 0x11F9, 1},
  1337. {0x1E00, 0x1E9B, 1},
  1338. {0x1EA0, 0x1EF9, 1},
  1339. {0x1F00, 0x1F15, 1},
  1340. {0x1F18, 0x1F1D, 1},
  1341. {0x1F20, 0x1F45, 1},
  1342. {0x1F48, 0x1F4D, 1},
  1343. {0x1F50, 0x1F57, 1},
  1344. {0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
  1345. {0x1F5D, 0x1F5D, 1},
  1346. {0x1F5F, 0x1F7D, 1},
  1347. {0x1F80, 0x1FB4, 1},
  1348. {0x1FB6, 0x1FBC, 1},
  1349. {0x1FBE, 0x1FBE, 1},
  1350. {0x1FC2, 0x1FC4, 1},
  1351. {0x1FC6, 0x1FCC, 1},
  1352. {0x1FD0, 0x1FD3, 1},
  1353. {0x1FD6, 0x1FDB, 1},
  1354. {0x1FE0, 0x1FEC, 1},
  1355. {0x1FF2, 0x1FF4, 1},
  1356. {0x1FF6, 0x1FFC, 1},
  1357. {0x2126, 0x2126, 1},
  1358. {0x212A, 0x212B, 1},
  1359. {0x212E, 0x212E, 1},
  1360. {0x2180, 0x2182, 1},
  1361. {0x3007, 0x3007, 1},
  1362. {0x3021, 0x3029, 1},
  1363. {0x3041, 0x3094, 1},
  1364. {0x30A1, 0x30FA, 1},
  1365. {0x3105, 0x312C, 1},
  1366. {0x4E00, 0x9FA5, 1},
  1367. {0xAC00, 0xD7A3, 1},
  1368. },
  1369. }
  1370. var second = &unicode.RangeTable{
  1371. R16: []unicode.Range16{
  1372. {0x002D, 0x002E, 1},
  1373. {0x0030, 0x0039, 1},
  1374. {0x00B7, 0x00B7, 1},
  1375. {0x02D0, 0x02D1, 1},
  1376. {0x0300, 0x0345, 1},
  1377. {0x0360, 0x0361, 1},
  1378. {0x0387, 0x0387, 1},
  1379. {0x0483, 0x0486, 1},
  1380. {0x0591, 0x05A1, 1},
  1381. {0x05A3, 0x05B9, 1},
  1382. {0x05BB, 0x05BD, 1},
  1383. {0x05BF, 0x05BF, 1},
  1384. {0x05C1, 0x05C2, 1},
  1385. {0x05C4, 0x0640, 0x0640 - 0x05C4},
  1386. {0x064B, 0x0652, 1},
  1387. {0x0660, 0x0669, 1},
  1388. {0x0670, 0x0670, 1},
  1389. {0x06D6, 0x06DC, 1},
  1390. {0x06DD, 0x06DF, 1},
  1391. {0x06E0, 0x06E4, 1},
  1392. {0x06E7, 0x06E8, 1},
  1393. {0x06EA, 0x06ED, 1},
  1394. {0x06F0, 0x06F9, 1},
  1395. {0x0901, 0x0903, 1},
  1396. {0x093C, 0x093C, 1},
  1397. {0x093E, 0x094C, 1},
  1398. {0x094D, 0x094D, 1},
  1399. {0x0951, 0x0954, 1},
  1400. {0x0962, 0x0963, 1},
  1401. {0x0966, 0x096F, 1},
  1402. {0x0981, 0x0983, 1},
  1403. {0x09BC, 0x09BC, 1},
  1404. {0x09BE, 0x09BF, 1},
  1405. {0x09C0, 0x09C4, 1},
  1406. {0x09C7, 0x09C8, 1},
  1407. {0x09CB, 0x09CD, 1},
  1408. {0x09D7, 0x09D7, 1},
  1409. {0x09E2, 0x09E3, 1},
  1410. {0x09E6, 0x09EF, 1},
  1411. {0x0A02, 0x0A3C, 0x3A},
  1412. {0x0A3E, 0x0A3F, 1},
  1413. {0x0A40, 0x0A42, 1},
  1414. {0x0A47, 0x0A48, 1},
  1415. {0x0A4B, 0x0A4D, 1},
  1416. {0x0A66, 0x0A6F, 1},
  1417. {0x0A70, 0x0A71, 1},
  1418. {0x0A81, 0x0A83, 1},
  1419. {0x0ABC, 0x0ABC, 1},
  1420. {0x0ABE, 0x0AC5, 1},
  1421. {0x0AC7, 0x0AC9, 1},
  1422. {0x0ACB, 0x0ACD, 1},
  1423. {0x0AE6, 0x0AEF, 1},
  1424. {0x0B01, 0x0B03, 1},
  1425. {0x0B3C, 0x0B3C, 1},
  1426. {0x0B3E, 0x0B43, 1},
  1427. {0x0B47, 0x0B48, 1},
  1428. {0x0B4B, 0x0B4D, 1},
  1429. {0x0B56, 0x0B57, 1},
  1430. {0x0B66, 0x0B6F, 1},
  1431. {0x0B82, 0x0B83, 1},
  1432. {0x0BBE, 0x0BC2, 1},
  1433. {0x0BC6, 0x0BC8, 1},
  1434. {0x0BCA, 0x0BCD, 1},
  1435. {0x0BD7, 0x0BD7, 1},
  1436. {0x0BE7, 0x0BEF, 1},
  1437. {0x0C01, 0x0C03, 1},
  1438. {0x0C3E, 0x0C44, 1},
  1439. {0x0C46, 0x0C48, 1},
  1440. {0x0C4A, 0x0C4D, 1},
  1441. {0x0C55, 0x0C56, 1},
  1442. {0x0C66, 0x0C6F, 1},
  1443. {0x0C82, 0x0C83, 1},
  1444. {0x0CBE, 0x0CC4, 1},
  1445. {0x0CC6, 0x0CC8, 1},
  1446. {0x0CCA, 0x0CCD, 1},
  1447. {0x0CD5, 0x0CD6, 1},
  1448. {0x0CE6, 0x0CEF, 1},
  1449. {0x0D02, 0x0D03, 1},
  1450. {0x0D3E, 0x0D43, 1},
  1451. {0x0D46, 0x0D48, 1},
  1452. {0x0D4A, 0x0D4D, 1},
  1453. {0x0D57, 0x0D57, 1},
  1454. {0x0D66, 0x0D6F, 1},
  1455. {0x0E31, 0x0E31, 1},
  1456. {0x0E34, 0x0E3A, 1},
  1457. {0x0E46, 0x0E46, 1},
  1458. {0x0E47, 0x0E4E, 1},
  1459. {0x0E50, 0x0E59, 1},
  1460. {0x0EB1, 0x0EB1, 1},
  1461. {0x0EB4, 0x0EB9, 1},
  1462. {0x0EBB, 0x0EBC, 1},
  1463. {0x0EC6, 0x0EC6, 1},
  1464. {0x0EC8, 0x0ECD, 1},
  1465. {0x0ED0, 0x0ED9, 1},
  1466. {0x0F18, 0x0F19, 1},
  1467. {0x0F20, 0x0F29, 1},
  1468. {0x0F35, 0x0F39, 2},
  1469. {0x0F3E, 0x0F3F, 1},
  1470. {0x0F71, 0x0F84, 1},
  1471. {0x0F86, 0x0F8B, 1},
  1472. {0x0F90, 0x0F95, 1},
  1473. {0x0F97, 0x0F97, 1},
  1474. {0x0F99, 0x0FAD, 1},
  1475. {0x0FB1, 0x0FB7, 1},
  1476. {0x0FB9, 0x0FB9, 1},
  1477. {0x20D0, 0x20DC, 1},
  1478. {0x20E1, 0x3005, 0x3005 - 0x20E1},
  1479. {0x302A, 0x302F, 1},
  1480. {0x3031, 0x3035, 1},
  1481. {0x3099, 0x309A, 1},
  1482. {0x309D, 0x309E, 1},
  1483. {0x30FC, 0x30FE, 1},
  1484. },
  1485. }
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters. Assign it to a Decoder's Entity
// field to have these names expanded in addition to the predefined
// XML entities.
//
// See the Decoder.Strict and Decoder.Entity fields' documentation.
  1490. var HTMLEntity map[string]string = htmlEntity
  1491. var htmlEntity = map[string]string{
  1492. /*
  1493. hget http://www.w3.org/TR/html4/sgml/entities.html |
  1494. ssam '
  1495. ,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
  1496. ,x v/^\&lt;!ENTITY/d
  1497. ,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/ "\1": "\\u\2",/g
  1498. '
  1499. */
  1500. "nbsp": "\u00A0",
  1501. "iexcl": "\u00A1",
  1502. "cent": "\u00A2",
  1503. "pound": "\u00A3",
  1504. "curren": "\u00A4",
  1505. "yen": "\u00A5",
  1506. "brvbar": "\u00A6",
  1507. "sect": "\u00A7",
  1508. "uml": "\u00A8",
  1509. "copy": "\u00A9",
  1510. "ordf": "\u00AA",
  1511. "laquo": "\u00AB",
  1512. "not": "\u00AC",
  1513. "shy": "\u00AD",
  1514. "reg": "\u00AE",
  1515. "macr": "\u00AF",
  1516. "deg": "\u00B0",
  1517. "plusmn": "\u00B1",
  1518. "sup2": "\u00B2",
  1519. "sup3": "\u00B3",
  1520. "acute": "\u00B4",
  1521. "micro": "\u00B5",
  1522. "para": "\u00B6",
  1523. "middot": "\u00B7",
  1524. "cedil": "\u00B8",
  1525. "sup1": "\u00B9",
  1526. "ordm": "\u00BA",
  1527. "raquo": "\u00BB",
  1528. "frac14": "\u00BC",
  1529. "frac12": "\u00BD",
  1530. "frac34": "\u00BE",
  1531. "iquest": "\u00BF",
  1532. "Agrave": "\u00C0",
  1533. "Aacute": "\u00C1",
  1534. "Acirc": "\u00C2",
  1535. "Atilde": "\u00C3",
  1536. "Auml": "\u00C4",
  1537. "Aring": "\u00C5",
  1538. "AElig": "\u00C6",
  1539. "Ccedil": "\u00C7",
  1540. "Egrave": "\u00C8",
  1541. "Eacute": "\u00C9",
  1542. "Ecirc": "\u00CA",
  1543. "Euml": "\u00CB",
  1544. "Igrave": "\u00CC",
  1545. "Iacute": "\u00CD",
  1546. "Icirc": "\u00CE",
  1547. "Iuml": "\u00CF",
  1548. "ETH": "\u00D0",
  1549. "Ntilde": "\u00D1",
  1550. "Ograve": "\u00D2",
  1551. "Oacute": "\u00D3",
  1552. "Ocirc": "\u00D4",
  1553. "Otilde": "\u00D5",
  1554. "Ouml": "\u00D6",
  1555. "times": "\u00D7",
  1556. "Oslash": "\u00D8",
  1557. "Ugrave": "\u00D9",
  1558. "Uacute": "\u00DA",
  1559. "Ucirc": "\u00DB",
  1560. "Uuml": "\u00DC",
  1561. "Yacute": "\u00DD",
  1562. "THORN": "\u00DE",
  1563. "szlig": "\u00DF",
  1564. "agrave": "\u00E0",
  1565. "aacute": "\u00E1",
  1566. "acirc": "\u00E2",
  1567. "atilde": "\u00E3",
  1568. "auml": "\u00E4",
  1569. "aring": "\u00E5",
  1570. "aelig": "\u00E6",
  1571. "ccedil": "\u00E7",
  1572. "egrave": "\u00E8",
  1573. "eacute": "\u00E9",
  1574. "ecirc": "\u00EA",
  1575. "euml": "\u00EB",
  1576. "igrave": "\u00EC",
  1577. "iacute": "\u00ED",
  1578. "icirc": "\u00EE",
  1579. "iuml": "\u00EF",
  1580. "eth": "\u00F0",
  1581. "ntilde": "\u00F1",
  1582. "ograve": "\u00F2",
  1583. "oacute": "\u00F3",
  1584. "ocirc": "\u00F4",
  1585. "otilde": "\u00F5",
  1586. "ouml": "\u00F6",
  1587. "divide": "\u00F7",
  1588. "oslash": "\u00F8",
  1589. "ugrave": "\u00F9",
  1590. "uacute": "\u00FA",
  1591. "ucirc": "\u00FB",
  1592. "uuml": "\u00FC",
  1593. "yacute": "\u00FD",
  1594. "thorn": "\u00FE",
  1595. "yuml": "\u00FF",
  1596. "fnof": "\u0192",
  1597. "Alpha": "\u0391",
  1598. "Beta": "\u0392",
  1599. "Gamma": "\u0393",
  1600. "Delta": "\u0394",
  1601. "Epsilon": "\u0395",
  1602. "Zeta": "\u0396",
  1603. "Eta": "\u0397",
  1604. "Theta": "\u0398",
  1605. "Iota": "\u0399",
  1606. "Kappa": "\u039A",
  1607. "Lambda": "\u039B",
  1608. "Mu": "\u039C",
  1609. "Nu": "\u039D",
  1610. "Xi": "\u039E",
  1611. "Omicron": "\u039F",
  1612. "Pi": "\u03A0",
  1613. "Rho": "\u03A1",
  1614. "Sigma": "\u03A3",
  1615. "Tau": "\u03A4",
  1616. "Upsilon": "\u03A5",
  1617. "Phi": "\u03A6",
  1618. "Chi": "\u03A7",
  1619. "Psi": "\u03A8",
  1620. "Omega": "\u03A9",
  1621. "alpha": "\u03B1",
  1622. "beta": "\u03B2",
  1623. "gamma": "\u03B3",
  1624. "delta": "\u03B4",
  1625. "epsilon": "\u03B5",
  1626. "zeta": "\u03B6",
  1627. "eta": "\u03B7",
  1628. "theta": "\u03B8",
  1629. "iota": "\u03B9",
  1630. "kappa": "\u03BA",
  1631. "lambda": "\u03BB",
  1632. "mu": "\u03BC",
  1633. "nu": "\u03BD",
  1634. "xi": "\u03BE",
  1635. "omicron": "\u03BF",
  1636. "pi": "\u03C0",
  1637. "rho": "\u03C1",
  1638. "sigmaf": "\u03C2",
  1639. "sigma": "\u03C3",
  1640. "tau": "\u03C4",
  1641. "upsilon": "\u03C5",
  1642. "phi": "\u03C6",
  1643. "chi": "\u03C7",
  1644. "psi": "\u03C8",
  1645. "omega": "\u03C9",
  1646. "thetasym": "\u03D1",
  1647. "upsih": "\u03D2",
  1648. "piv": "\u03D6",
  1649. "bull": "\u2022",
  1650. "hellip": "\u2026",
  1651. "prime": "\u2032",
  1652. "Prime": "\u2033",
  1653. "oline": "\u203E",
  1654. "frasl": "\u2044",
  1655. "weierp": "\u2118",
  1656. "image": "\u2111",
  1657. "real": "\u211C",
  1658. "trade": "\u2122",
  1659. "alefsym": "\u2135",
  1660. "larr": "\u2190",
  1661. "uarr": "\u2191",
  1662. "rarr": "\u2192",
  1663. "darr": "\u2193",
  1664. "harr": "\u2194",
  1665. "crarr": "\u21B5",
  1666. "lArr": "\u21D0",
  1667. "uArr": "\u21D1",
  1668. "rArr": "\u21D2",
  1669. "dArr": "\u21D3",
  1670. "hArr": "\u21D4",
  1671. "forall": "\u2200",
  1672. "part": "\u2202",
  1673. "exist": "\u2203",
  1674. "empty": "\u2205",
  1675. "nabla": "\u2207",
  1676. "isin": "\u2208",
  1677. "notin": "\u2209",
  1678. "ni": "\u220B",
  1679. "prod": "\u220F",
  1680. "sum": "\u2211",
  1681. "minus": "\u2212",
  1682. "lowast": "\u2217",
  1683. "radic": "\u221A",
  1684. "prop": "\u221D",
  1685. "infin": "\u221E",
  1686. "ang": "\u2220",
  1687. "and": "\u2227",
  1688. "or": "\u2228",
  1689. "cap": "\u2229",
  1690. "cup": "\u222A",
  1691. "int": "\u222B",
  1692. "there4": "\u2234",
  1693. "sim": "\u223C",
  1694. "cong": "\u2245",
  1695. "asymp": "\u2248",
  1696. "ne": "\u2260",
  1697. "equiv": "\u2261",
  1698. "le": "\u2264",
  1699. "ge": "\u2265",
  1700. "sub": "\u2282",
  1701. "sup": "\u2283",
  1702. "nsub": "\u2284",
  1703. "sube": "\u2286",
  1704. "supe": "\u2287",
  1705. "oplus": "\u2295",
  1706. "otimes": "\u2297",
  1707. "perp": "\u22A5",
  1708. "sdot": "\u22C5",
  1709. "lceil": "\u2308",
  1710. "rceil": "\u2309",
  1711. "lfloor": "\u230A",
  1712. "rfloor": "\u230B",
  1713. "lang": "\u2329",
  1714. "rang": "\u232A",
  1715. "loz": "\u25CA",
  1716. "spades": "\u2660",
  1717. "clubs": "\u2663",
  1718. "hearts": "\u2665",
  1719. "diams": "\u2666",
  1720. "quot": "\u0022",
  1721. "amp": "\u0026",
  1722. "lt": "\u003C",
  1723. "gt": "\u003E",
  1724. "OElig": "\u0152",
  1725. "oelig": "\u0153",
  1726. "Scaron": "\u0160",
  1727. "scaron": "\u0161",
  1728. "Yuml": "\u0178",
  1729. "circ": "\u02C6",
  1730. "tilde": "\u02DC",
  1731. "ensp": "\u2002",
  1732. "emsp": "\u2003",
  1733. "thinsp": "\u2009",
  1734. "zwnj": "\u200C",
  1735. "zwj": "\u200D",
  1736. "lrm": "\u200E",
  1737. "rlm": "\u200F",
  1738. "ndash": "\u2013",
  1739. "mdash": "\u2014",
  1740. "lsquo": "\u2018",
  1741. "rsquo": "\u2019",
  1742. "sbquo": "\u201A",
  1743. "ldquo": "\u201C",
  1744. "rdquo": "\u201D",
  1745. "bdquo": "\u201E",
  1746. "dagger": "\u2020",
  1747. "Dagger": "\u2021",
  1748. "permil": "\u2030",
  1749. "lsaquo": "\u2039",
  1750. "rsaquo": "\u203A",
  1751. "euro": "\u20AC",
  1752. }
  1753. // HTMLAutoClose is the set of HTML elements that
  1754. // should be considered to close automatically.
  1755. //
  1756. // See the Decoder.Strict and Decoder.Entity fields' documentation.
  1757. var HTMLAutoClose []string = htmlAutoClose
  1758. var htmlAutoClose = []string{
  1759. /*
  1760. hget http://www.w3.org/TR/html4/loose.dtd |
  1761. 9 sed -n 's/<!ELEMENT ([^ ]*) +- O EMPTY.+/ "\1",/p' | tr A-Z a-z
  1762. */
  1763. "basefont",
  1764. "br",
  1765. "area",
  1766. "link",
  1767. "img",
  1768. "param",
  1769. "hr",
  1770. "input",
  1771. "col",
  1772. "frame",
  1773. "isindex",
  1774. "base",
  1775. "meta",
  1776. }
  1777. var (
  1778. escQuot = []byte("&#34;") // shorter than "&quot;"
  1779. escApos = []byte("&#39;") // shorter than "&apos;"
  1780. escAmp = []byte("&amp;")
  1781. escLT = []byte("&lt;")
  1782. escGT = []byte("&gt;")
  1783. escTab = []byte("&#x9;")
  1784. escNL = []byte("&#xA;")
  1785. escCR = []byte("&#xD;")
  1786. escFFFD = []byte("\uFFFD") // Unicode replacement character
  1787. )
// EscapeText writes to w the properly escaped XML equivalent
// of the plain text data s.
//
// It calls escapeText with escapeNewline set to true, so newline
// characters are escaped as well. The returned error is whatever
// escapeText returns.
func EscapeText(w io.Writer, s []byte) error {
	return escapeText(w, s, true)
}
  1793. // escapeText writes to w the properly escaped XML equivalent
  1794. // of the plain text data s. If escapeNewline is true, newline
  1795. // characters will be escaped.
  1796. func escapeText(w io.Writer, s []byte, escapeNewline bool) error {
  1797. var esc []byte
  1798. last := 0
  1799. for i := 0; i < len(s); {
  1800. r, width := utf8.DecodeRune(s[i:])
  1801. i += width
  1802. switch r {
  1803. case '"':
  1804. esc = escQuot
  1805. case '\'':
  1806. esc = escApos
  1807. case '&':
  1808. esc = escAmp
  1809. case '<':
  1810. esc = escLT
  1811. case '>':
  1812. esc = escGT
  1813. case '\t':
  1814. esc = escTab
  1815. case '\n':
  1816. if !escapeNewline {
  1817. continue
  1818. }
  1819. esc = escNL
  1820. case '\r':
  1821. esc = escCR
  1822. default:
  1823. if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
  1824. esc = escFFFD
  1825. break
  1826. }
  1827. continue
  1828. }
  1829. if _, err := w.Write(s[last : i-width]); err != nil {
  1830. return err
  1831. }
  1832. if _, err := w.Write(esc); err != nil {
  1833. return err
  1834. }
  1835. last = i
  1836. }
  1837. _, err := w.Write(s[last:])
  1838. return err
  1839. }
  1840. // EscapeString writes to p the properly escaped XML equivalent
  1841. // of the plain text data s.
  1842. func (p *printer) EscapeString(s string) {
  1843. var esc []byte
  1844. last := 0
  1845. for i := 0; i < len(s); {
  1846. r, width := utf8.DecodeRuneInString(s[i:])
  1847. i += width
  1848. switch r {
  1849. case '"':
  1850. esc = escQuot
  1851. case '\'':
  1852. esc = escApos
  1853. case '&':
  1854. esc = escAmp
  1855. case '<':
  1856. esc = escLT
  1857. case '>':
  1858. esc = escGT
  1859. case '\t':
  1860. esc = escTab
  1861. case '\n':
  1862. esc = escNL
  1863. case '\r':
  1864. esc = escCR
  1865. default:
  1866. if !isInCharacterRange(r) || (r == 0xFFFD && width == 1) {
  1867. esc = escFFFD
  1868. break
  1869. }
  1870. continue
  1871. }
  1872. p.WriteString(s[last : i-width])
  1873. p.Write(esc)
  1874. last = i
  1875. }
  1876. p.WriteString(s[last:])
  1877. }
// Escape is like EscapeText but omits the error return value.
// It is provided for backwards compatibility with Go 1.0.
// Code targeting Go 1.1 or later should use EscapeText.
//
// Any error reported by EscapeText is discarded, so a failed write
// to w goes unnoticed by the caller.
func Escape(w io.Writer, s []byte) {
	EscapeText(w, s)
}
  1884. var (
  1885. cdataStart = []byte("<![CDATA[")
  1886. cdataEnd = []byte("]]>")
  1887. cdataEscape = []byte("]]]]><![CDATA[>")
  1888. )
  1889. // emitCDATA writes to w the CDATA-wrapped plain text data s.
  1890. // It escapes CDATA directives nested in s.
  1891. func emitCDATA(w io.Writer, s []byte) error {
  1892. if len(s) == 0 {
  1893. return nil
  1894. }
  1895. if _, err := w.Write(cdataStart); err != nil {
  1896. return err
  1897. }
  1898. for {
  1899. i := bytes.Index(s, cdataEnd)
  1900. if i >= 0 && i+len(cdataEnd) <= len(s) {
  1901. // Found a nested CDATA directive end.
  1902. if _, err := w.Write(s[:i]); err != nil {
  1903. return err
  1904. }
  1905. if _, err := w.Write(cdataEscape); err != nil {
  1906. return err
  1907. }
  1908. i += len(cdataEnd)
  1909. } else {
  1910. if _, err := w.Write(s); err != nil {
  1911. return err
  1912. }
  1913. break
  1914. }
  1915. s = s[i:]
  1916. }
  1917. _, err := w.Write(cdataEnd)
  1918. return err
  1919. }
  1920. // procInst parses the `param="..."` or `param='...'`
  1921. // value out of the provided string, returning "" if not found.
  1922. func procInst(param, s string) string {
  1923. // TODO: this parsing is somewhat lame and not exact.
  1924. // It works for all actual cases, though.
  1925. param = param + "="
  1926. idx := strings.Index(s, param)
  1927. if idx == -1 {
  1928. return ""
  1929. }
  1930. v := s[idx+len(param):]
  1931. if v == "" {
  1932. return ""
  1933. }
  1934. if v[0] != '\'' && v[0] != '"' {
  1935. return ""
  1936. }
  1937. idx = strings.IndexRune(v[1:], rune(v[0]))
  1938. if idx == -1 {
  1939. return ""
  1940. }
  1941. return v[1 : idx+1]
  1942. }