decoder.go 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. package xml2json
  import (
  	"encoding/xml"
  	"io"
  	"strings"
  	"unicode"

  	"golang.org/x/net/html/charset"
  )
  // Default prefixes that NewDecoder installs on every new Decoder.
  const (
  	// attrPrefix is the default value of Decoder.attributePrefix.
  	attrPrefix = "-"

  	// contentPrefix is the default value of Decoder.contentPrefix.
  	contentPrefix = "#"
  )
  12. // A Decoder reads and decodes XML objects from an input stream.
  13. type Decoder struct {
  14. r io.Reader
  15. err error
  16. attributePrefix string
  17. contentPrefix string
  18. excludeAttrs map[string]bool
  19. formatters []nodeFormatter
  20. }
  21. type element struct {
  22. parent *element
  23. n *Node
  24. label string
  25. }
  26. func (dec *Decoder) SetAttributePrefix(prefix string) {
  27. dec.attributePrefix = prefix
  28. }
  29. func (dec *Decoder) SetContentPrefix(prefix string) {
  30. dec.contentPrefix = prefix
  31. }
  32. func (dec *Decoder) AddFormatters(formatters []nodeFormatter) {
  33. dec.formatters = formatters
  34. }
  35. func (dec *Decoder) ExcludeAttributes(attrs []string) {
  36. for _, attr := range attrs {
  37. dec.excludeAttrs[attr] = true
  38. }
  39. }
  40. func (dec *Decoder) DecodeWithCustomPrefixes(root *Node, contentPrefix string, attributePrefix string) error {
  41. dec.contentPrefix = contentPrefix
  42. dec.attributePrefix = attributePrefix
  43. return dec.Decode(root)
  44. }
  45. // NewDecoder returns a new decoder that reads from r.
  46. func NewDecoder(r io.Reader, plugins ...plugin) *Decoder {
  47. d := &Decoder{r: r, contentPrefix: contentPrefix, attributePrefix: attrPrefix, excludeAttrs: map[string]bool{}}
  48. for _, p := range plugins {
  49. d = p.AddToDecoder(d)
  50. }
  51. return d
  52. }
  53. // Decode reads the next JSON-encoded value from its
  54. // input and stores it in the value pointed to by v.
  55. func (dec *Decoder) Decode(root *Node) error {
  56. xmlDec := xml.NewDecoder(dec.r)
  57. // That will convert the charset if the provided XML is non-UTF-8
  58. xmlDec.CharsetReader = charset.NewReaderLabel
  59. // Create first element from the root node
  60. elem := &element{
  61. parent: nil,
  62. n: root,
  63. }
  64. for {
  65. t, _ := xmlDec.Token()
  66. if t == nil {
  67. break
  68. }
  69. switch se := t.(type) {
  70. case xml.StartElement:
  71. // Build new a new current element and link it to its parent
  72. elem = &element{
  73. parent: elem,
  74. n: &Node{},
  75. label: se.Name.Local,
  76. }
  77. // Extract attributes as children
  78. for _, a := range se.Attr {
  79. if _, ok := dec.excludeAttrs[a.Name.Local]; ok {
  80. continue
  81. }
  82. elem.n.AddChild(dec.attributePrefix+a.Name.Local, &Node{Data: a.Value})
  83. }
  84. case xml.CharData:
  85. // Extract XML data (if any)
  86. elem.n.Data = trimNonGraphic(string(xml.CharData(se)))
  87. case xml.EndElement:
  88. // And add it to its parent list
  89. if elem.parent != nil {
  90. elem.parent.n.AddChild(elem.label, elem.n)
  91. }
  92. // Then change the current element to its parent
  93. elem = elem.parent
  94. }
  95. }
  96. for _, formatter := range dec.formatters {
  97. formatter.Format(root)
  98. }
  99. return nil
  100. }
  // trimNonGraphic returns a slice of the string s with all leading and trailing
  // runes removed that are either not graphic (per unicode.IsGraphic) or white
  // space (per unicode.IsSpace).
  //
  // Runes between the first and last retained rune are left untouched, so
  // interior spaces and control characters survive. The result is the empty
  // string when s contains no retained rune. No bytes of s are rewritten: the
  // result is always a substring of s.
  func trimNonGraphic(s string) string {
  	return strings.TrimFunc(s, func(r rune) bool {
  		return !unicode.IsGraphic(r) || unicode.IsSpace(r)
  	})
  }