db.go 37 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361
  1. package bbolt
  2. import (
  3. "errors"
  4. "fmt"
  5. "hash/fnv"
  6. "io"
  7. "os"
  8. "runtime"
  9. "sort"
  10. "sync"
  11. "time"
  12. "unsafe"
  13. )
// maxMmapStep is the largest step that can be taken when remapping the mmap.
// mmapSize rounds sizes above 1GB up to a multiple of this value.
const maxMmapStep = 1 << 30 // 1GB

// version is the data file format version written into new meta pages.
const version = 2

// magic is a marker value to indicate that a file is a Bolt DB.
const magic uint32 = 0xED0CDAED

// pgidNoFreelist is the sentinel compared against a meta page's freelist
// field by hasSyncedFreelist: when the field equals this value, the freelist
// is treated as not synced to disk.
const pgidNoFreelist pgid = 0xffffffffffffffff

// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file. This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"

// Default values if not set in a DB instance. Open copies each of them into
// the corresponding DB field.
const (
	DefaultMaxBatchSize int = 1000
	DefaultMaxBatchDelay = 10 * time.Millisecond
	DefaultAllocSize = 16 * 1024 * 1024
)

// defaultPageSize is the page size used by Open when Options.PageSize is 0;
// it is set to the OS page size.
var defaultPageSize = os.Getpagesize()

// flockRetryTimeout is the time elapsed between consecutive file locking attempts.
const flockRetryTimeout = 50 * time.Millisecond
// FreelistType is the type of the freelist backend.
// The value is passed to newFreelist when the freelist is loaded.
type FreelistType string

const (
	// FreelistArrayType indicates backend freelist type is array
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates backend freelist type is hashmap
	FreelistMapType = FreelistType("hashmap")
)
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored. See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// When `true`, bbolt will always load the free pages when opening the DB.
	// When opening db in write mode, this flag will always automatically
	// set to `true`.
	PreLoadFreelist bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// Mlock locks database file in memory when set to true.
	// It prevents major page faults, however used memory can't be reclaimed.
	//
	// Supported only on Unix via mlock/munlock syscalls.
	Mlock bool

	// path is the name of the open data file; Open sets it from the opened
	// file's Name() and close resets it to "".
	path string
	// openFile opens the data file; Open uses Options.OpenFile when set and
	// os.OpenFile otherwise.
	openFile func(string, int, os.FileMode) (*os.File, error)
	// file is the handle of the open data file; nil after a successful close.
	file *os.File
	// `dataref` isn't used at all on Windows, and the golangci-lint
	// always fails on Windows platform.
	//nolint
	dataref []byte // mmap'ed readonly, write throws SEGV
	// data is nil while the file is not mapped; transactions refuse to start
	// with ErrInvalidMapping in that state.
	data *[maxMapSize]byte
	// datasz is reset to 0 by invalidate together with data.
	// NOTE(review): presumably the size of the current mapping — confirm
	// against the platform mmap implementation.
	datasz int
	filesz int // current on disk file size
	// meta0 and meta1 reference the meta pages stored in pages 0 and 1 of
	// the current mapping.
	meta0 *meta
	meta1 *meta
	// pageSize is taken from Options.PageSize, the OS page size, or an
	// existing file's meta page (see Open).
	pageSize int
	// opened is set when Open constructs the DB and cleared by close.
	opened bool
	// rwtx is the most recently started read-write transaction.
	rwtx *Tx
	// txs holds the currently open read-only transactions.
	txs []*Tx
	stats Stats

	// freelist is created by loadFreelist; freelistLoad guarantees that the
	// load happens at most once.
	freelist *freelist
	freelistLoad sync.Once

	// pagePool hands out byte slices of length pageSize.
	pagePool sync.Pool

	// batchMu is locked by Batch.
	// NOTE(review): presumably guards the batch field — confirm in Batch.
	batchMu sync.Mutex
	batch *batch

	rwlock sync.Mutex // Allows only one writer at a time.
	metalock sync.Mutex // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds I/O hooks that tests can override; writeAt defaults to the
	// data file's WriteAt.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
  144. // Path returns the path to currently open database file.
  145. func (db *DB) Path() string {
  146. return db.path
  147. }
  148. // GoString returns the Go string representation of the database.
  149. func (db *DB) GoString() string {
  150. return fmt.Sprintf("bolt.DB{path:%q}", db.path)
  151. }
  152. // String returns the string representation of the database.
  153. func (db *DB) String() string {
  154. return fmt.Sprintf("DB<%q>", db.path)
  155. }
// Open creates and opens a database at the given path.
// If the file does not exist then it will be created automatically.
// Passing in nil options will cause Bolt to open the database with the default options.
//
// The steps are, in order: copy the options onto the DB, open and lock the
// file, initialize an empty file or read the page size of an existing one,
// memory-map the file, optionally load the freelist and, in read-write mode,
// flush the freelist if the file does not contain a synced one.
//
// On any failure the partially opened DB is cleaned up with close() and a nil
// DB is returned together with the error. A page size that cannot be
// determined from an existing file is reported as ErrInvalid.
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
	// opened is set up front: close() is a no-op on a DB that is not marked
	// opened, and every error path below relies on close() for cleanup.
	db := &DB{
		opened: true,
	}
	// Set default options if no options are provided.
	if options == nil {
		options = DefaultOptions
	}
	db.NoSync = options.NoSync
	db.NoGrowSync = options.NoGrowSync
	db.MmapFlags = options.MmapFlags
	db.NoFreelistSync = options.NoFreelistSync
	db.PreLoadFreelist = options.PreLoadFreelist
	db.FreelistType = options.FreelistType
	db.Mlock = options.Mlock

	// Set default values for later DB operations.
	db.MaxBatchSize = DefaultMaxBatchSize
	db.MaxBatchDelay = DefaultMaxBatchDelay
	db.AllocSize = DefaultAllocSize

	flag := os.O_RDWR
	if options.ReadOnly {
		flag = os.O_RDONLY
		db.readOnly = true
	} else {
		// always load free pages in write mode
		db.PreLoadFreelist = true
	}

	// Fall back to os.OpenFile when the caller did not supply an opener.
	db.openFile = options.OpenFile
	if db.openFile == nil {
		db.openFile = os.OpenFile
	}

	// Open the data file, creating it if it does not exist.
	var err error
	if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
		_ = db.close()
		return nil, err
	}
	db.path = db.file.Name()

	// Lock file so that other processes using Bolt in read-write mode cannot
	// use the database at the same time. This would cause corruption since
	// the two processes would write meta pages and free pages separately.
	// The database file is locked exclusively (only one process can grab the lock)
	// if !options.ReadOnly.
	// The database file is locked using the shared lock (more than one process may
	// hold a lock at the same time) otherwise (options.ReadOnly is set).
	if err := flock(db, !db.readOnly, options.Timeout); err != nil {
		_ = db.close()
		return nil, err
	}

	// Default values for test hooks
	db.ops.writeAt = db.file.WriteAt

	if db.pageSize = options.PageSize; db.pageSize == 0 {
		// Set the default page size to the OS page size.
		db.pageSize = defaultPageSize
	}

	// Initialize the database if it doesn't exist.
	if info, err := db.file.Stat(); err != nil {
		_ = db.close()
		return nil, err
	} else if info.Size() == 0 {
		// Initialize new files with meta pages.
		if err := db.init(); err != nil {
			// clean up file descriptor on initialization fail
			_ = db.close()
			return nil, err
		}
	} else {
		// try to get the page size from the metadata pages
		if pgSize, err := db.getPageSize(); err == nil {
			db.pageSize = pgSize
		} else {
			// The underlying error is discarded; callers always see ErrInvalid.
			_ = db.close()
			return nil, ErrInvalid
		}
	}

	// Initialize page pool.
	// The closure reads db.pageSize each time it allocates a buffer.
	db.pagePool = sync.Pool{
		New: func() interface{} {
			return make([]byte, db.pageSize)
		},
	}

	// Memory map the data file.
	if err := db.mmap(options.InitialMmapSize); err != nil {
		_ = db.close()
		return nil, err
	}

	if db.PreLoadFreelist {
		db.loadFreelist()
	}

	// Nothing is written in read-only mode, so the freelist flush is skipped.
	if db.readOnly {
		return db, nil
	}

	// Flush freelist when transitioning from no sync to sync so
	// NoFreelistSync unaware boltdb can open the db later.
	if !db.NoFreelistSync && !db.hasSyncedFreelist() {
		// Committing an otherwise empty write transaction performs the flush.
		tx, err := db.Begin(true)
		if tx != nil {
			err = tx.Commit()
		}
		if err != nil {
			_ = db.close()
			return nil, err
		}
	}

	// The database is ready for use.
	return db, nil
}
  266. // getPageSize reads the pageSize from the meta pages. It tries
  267. // to read the first meta page firstly. If the first page is invalid,
  268. // then it tries to read the second page using the default page size.
  269. func (db *DB) getPageSize() (int, error) {
  270. var (
  271. meta0CanRead, meta1CanRead bool
  272. )
  273. // Read the first meta page to determine the page size.
  274. if pgSize, canRead, err := db.getPageSizeFromFirstMeta(); err != nil {
  275. // We cannot read the page size from page 0, but can read page 0.
  276. meta0CanRead = canRead
  277. } else {
  278. return pgSize, nil
  279. }
  280. // Read the second meta page to determine the page size.
  281. if pgSize, canRead, err := db.getPageSizeFromSecondMeta(); err != nil {
  282. // We cannot read the page size from page 1, but can read page 1.
  283. meta1CanRead = canRead
  284. } else {
  285. return pgSize, nil
  286. }
  287. // If we can't read the page size from both pages, but can read
  288. // either page, then we assume it's the same as the OS or the one
  289. // given, since that's how the page size was chosen in the first place.
  290. //
  291. // If both pages are invalid, and (this OS uses a different page size
  292. // from what the database was created with or the given page size is
  293. // different from what the database was created with), then we are out
  294. // of luck and cannot access the database.
  295. if meta0CanRead || meta1CanRead {
  296. return db.pageSize, nil
  297. }
  298. return 0, ErrInvalid
  299. }
  300. // getPageSizeFromFirstMeta reads the pageSize from the first meta page
  301. func (db *DB) getPageSizeFromFirstMeta() (int, bool, error) {
  302. var buf [0x1000]byte
  303. var metaCanRead bool
  304. if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
  305. metaCanRead = true
  306. if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
  307. return int(m.pageSize), metaCanRead, nil
  308. }
  309. }
  310. return 0, metaCanRead, ErrInvalid
  311. }
  312. // getPageSizeFromSecondMeta reads the pageSize from the second meta page
  313. func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) {
  314. var (
  315. fileSize int64
  316. metaCanRead bool
  317. )
  318. // get the db file size
  319. if info, err := db.file.Stat(); err != nil {
  320. return 0, metaCanRead, err
  321. } else {
  322. fileSize = info.Size()
  323. }
  324. // We need to read the second meta page, so we should skip the first page;
  325. // but we don't know the exact page size yet, it's chicken & egg problem.
  326. // The solution is to try all the possible page sizes, which starts from 1KB
  327. // and until 16MB (1024<<14) or the end of the db file
  328. //
  329. // TODO: should we support larger page size?
  330. for i := 0; i <= 14; i++ {
  331. var buf [0x1000]byte
  332. var pos int64 = 1024 << uint(i)
  333. if pos >= fileSize-1024 {
  334. break
  335. }
  336. bw, err := db.file.ReadAt(buf[:], pos)
  337. if (err == nil && bw == len(buf)) || (err == io.EOF && int64(bw) == (fileSize-pos)) {
  338. metaCanRead = true
  339. if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
  340. return int(m.pageSize), metaCanRead, nil
  341. }
  342. }
  343. }
  344. return 0, metaCanRead, ErrInvalid
  345. }
  346. // loadFreelist reads the freelist if it is synced, or reconstructs it
  347. // by scanning the DB if it is not synced. It assumes there are no
  348. // concurrent accesses being made to the freelist.
  349. func (db *DB) loadFreelist() {
  350. db.freelistLoad.Do(func() {
  351. db.freelist = newFreelist(db.FreelistType)
  352. if !db.hasSyncedFreelist() {
  353. // Reconstruct free list by scanning the DB.
  354. db.freelist.readIDs(db.freepages())
  355. } else {
  356. // Read free list from freelist page.
  357. db.freelist.read(db.page(db.meta().freelist))
  358. }
  359. db.stats.FreePageN = db.freelist.free_count()
  360. })
  361. }
  362. func (db *DB) hasSyncedFreelist() bool {
  363. return db.meta().freelist != pgidNoFreelist
  364. }
  365. // mmap opens the underlying memory-mapped file and initializes the meta references.
  366. // minsz is the minimum size that the new mmap can be.
  367. func (db *DB) mmap(minsz int) error {
  368. db.mmaplock.Lock()
  369. defer db.mmaplock.Unlock()
  370. info, err := db.file.Stat()
  371. if err != nil {
  372. return fmt.Errorf("mmap stat error: %s", err)
  373. } else if int(info.Size()) < db.pageSize*2 {
  374. return fmt.Errorf("file size too small")
  375. }
  376. // Ensure the size is at least the minimum size.
  377. fileSize := int(info.Size())
  378. var size = fileSize
  379. if size < minsz {
  380. size = minsz
  381. }
  382. size, err = db.mmapSize(size)
  383. if err != nil {
  384. return err
  385. }
  386. if db.Mlock {
  387. // Unlock db memory
  388. if err := db.munlock(fileSize); err != nil {
  389. return err
  390. }
  391. }
  392. // Dereference all mmap references before unmapping.
  393. if db.rwtx != nil {
  394. db.rwtx.root.dereference()
  395. }
  396. // Unmap existing data before continuing.
  397. if err := db.munmap(); err != nil {
  398. return err
  399. }
  400. // Memory-map the data file as a byte slice.
  401. // gofail: var mapError string
  402. // return errors.New(mapError)
  403. if err := mmap(db, size); err != nil {
  404. return err
  405. }
  406. if db.Mlock {
  407. // Don't allow swapping of data file
  408. if err := db.mlock(fileSize); err != nil {
  409. return err
  410. }
  411. }
  412. // Save references to the meta pages.
  413. db.meta0 = db.page(0).meta()
  414. db.meta1 = db.page(1).meta()
  415. // Validate the meta pages. We only return an error if both meta pages fail
  416. // validation, since meta0 failing validation means that it wasn't saved
  417. // properly -- but we can recover using meta1. And vice-versa.
  418. err0 := db.meta0.validate()
  419. err1 := db.meta1.validate()
  420. if err0 != nil && err1 != nil {
  421. return err0
  422. }
  423. return nil
  424. }
  425. func (db *DB) invalidate() {
  426. db.dataref = nil
  427. db.data = nil
  428. db.datasz = 0
  429. db.meta0 = nil
  430. db.meta1 = nil
  431. }
  432. // munmap unmaps the data file from memory.
  433. func (db *DB) munmap() error {
  434. defer db.invalidate()
  435. // gofail: var unmapError string
  436. // return errors.New(unmapError)
  437. if err := munmap(db); err != nil {
  438. return fmt.Errorf("unmap error: " + err.Error())
  439. }
  440. return nil
  441. }
  442. // mmapSize determines the appropriate size for the mmap given the current size
  443. // of the database. The minimum size is 32KB and doubles until it reaches 1GB.
  444. // Returns an error if the new mmap size is greater than the max allowed.
  445. func (db *DB) mmapSize(size int) (int, error) {
  446. // Double the size from 32KB until 1GB.
  447. for i := uint(15); i <= 30; i++ {
  448. if size <= 1<<i {
  449. return 1 << i, nil
  450. }
  451. }
  452. // Verify the requested size is not above the maximum allowed.
  453. if size > maxMapSize {
  454. return 0, fmt.Errorf("mmap too large")
  455. }
  456. // If larger than 1GB then grow by 1GB at a time.
  457. sz := int64(size)
  458. if remainder := sz % int64(maxMmapStep); remainder > 0 {
  459. sz += int64(maxMmapStep) - remainder
  460. }
  461. // Ensure that the mmap size is a multiple of the page size.
  462. // This should always be true since we're incrementing in MBs.
  463. pageSize := int64(db.pageSize)
  464. if (sz % pageSize) != 0 {
  465. sz = ((sz / pageSize) + 1) * pageSize
  466. }
  467. // If we've exceeded the max size then only grow up to the max size.
  468. if sz > maxMapSize {
  469. sz = maxMapSize
  470. }
  471. return int(sz), nil
  472. }
  473. func (db *DB) munlock(fileSize int) error {
  474. if err := munlock(db, fileSize); err != nil {
  475. return fmt.Errorf("munlock error: " + err.Error())
  476. }
  477. return nil
  478. }
  479. func (db *DB) mlock(fileSize int) error {
  480. if err := mlock(db, fileSize); err != nil {
  481. return fmt.Errorf("mlock error: " + err.Error())
  482. }
  483. return nil
  484. }
  485. func (db *DB) mrelock(fileSizeFrom, fileSizeTo int) error {
  486. if err := db.munlock(fileSizeFrom); err != nil {
  487. return err
  488. }
  489. if err := db.mlock(fileSizeTo); err != nil {
  490. return err
  491. }
  492. return nil
  493. }
  494. // init creates a new database file and initializes its meta pages.
  495. func (db *DB) init() error {
  496. // Create two meta pages on a buffer.
  497. buf := make([]byte, db.pageSize*4)
  498. for i := 0; i < 2; i++ {
  499. p := db.pageInBuffer(buf, pgid(i))
  500. p.id = pgid(i)
  501. p.flags = metaPageFlag
  502. // Initialize the meta page.
  503. m := p.meta()
  504. m.magic = magic
  505. m.version = version
  506. m.pageSize = uint32(db.pageSize)
  507. m.freelist = 2
  508. m.root = bucket{root: 3}
  509. m.pgid = 4
  510. m.txid = txid(i)
  511. m.checksum = m.sum64()
  512. }
  513. // Write an empty freelist at page 3.
  514. p := db.pageInBuffer(buf, pgid(2))
  515. p.id = pgid(2)
  516. p.flags = freelistPageFlag
  517. p.count = 0
  518. // Write an empty leaf page at page 4.
  519. p = db.pageInBuffer(buf, pgid(3))
  520. p.id = pgid(3)
  521. p.flags = leafPageFlag
  522. p.count = 0
  523. // Write the buffer to our data file.
  524. if _, err := db.ops.writeAt(buf, 0); err != nil {
  525. return err
  526. }
  527. if err := fdatasync(db); err != nil {
  528. return err
  529. }
  530. db.filesz = len(buf)
  531. return nil
  532. }
// Close releases all database resources.
// It will block waiting for any open transactions to finish
// before closing the database and returning.
//
// The writer lock, the meta lock and the mmap write lock are acquired in
// that order and held until close() has returned; the deferred unlocks run
// in reverse order.
func (db *DB) Close() error {
	// Taken first: a write transaction holds rwlock until it closes.
	db.rwlock.Lock()
	defer db.rwlock.Unlock()

	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Read transactions hold the read side of mmaplock until they are
	// removed, so this blocks until none are open.
	db.mmaplock.Lock()
	defer db.mmaplock.Unlock()

	return db.close()
}
  545. func (db *DB) close() error {
  546. if !db.opened {
  547. return nil
  548. }
  549. db.opened = false
  550. db.freelist = nil
  551. // Clear ops.
  552. db.ops.writeAt = nil
  553. // Close the mmap.
  554. if err := db.munmap(); err != nil {
  555. return err
  556. }
  557. // Close file handles.
  558. if db.file != nil {
  559. // No need to unlock read-only file.
  560. if !db.readOnly {
  561. // Unlock the file.
  562. if err := funlock(db); err != nil {
  563. return fmt.Errorf("bolt.Close(): funlock error: %w", err)
  564. }
  565. }
  566. // Close the file descriptor.
  567. if err := db.file.Close(); err != nil {
  568. return fmt.Errorf("db file close: %s", err)
  569. }
  570. db.file = nil
  571. }
  572. db.path = ""
  573. return nil
  574. }
  575. // Begin starts a new transaction.
  576. // Multiple read-only transactions can be used concurrently but only one
  577. // write transaction can be used at a time. Starting multiple write transactions
  578. // will cause the calls to block and be serialized until the current write
  579. // transaction finishes.
  580. //
  581. // Transactions should not be dependent on one another. Opening a read
  582. // transaction and a write transaction in the same goroutine can cause the
  583. // writer to deadlock because the database periodically needs to re-mmap itself
  584. // as it grows and it cannot do that while a read transaction is open.
  585. //
  586. // If a long running read transaction (for example, a snapshot transaction) is
  587. // needed, you might want to set DB.InitialMmapSize to a large enough value
  588. // to avoid potential blocking of write transaction.
  589. //
  590. // IMPORTANT: You must close read-only transactions after you are finished or
  591. // else the database will not reclaim old pages.
  592. func (db *DB) Begin(writable bool) (*Tx, error) {
  593. if writable {
  594. return db.beginRWTx()
  595. }
  596. return db.beginTx()
  597. }
  598. func (db *DB) beginTx() (*Tx, error) {
  599. // Lock the meta pages while we initialize the transaction. We obtain
  600. // the meta lock before the mmap lock because that's the order that the
  601. // write transaction will obtain them.
  602. db.metalock.Lock()
  603. // Obtain a read-only lock on the mmap. When the mmap is remapped it will
  604. // obtain a write lock so all transactions must finish before it can be
  605. // remapped.
  606. db.mmaplock.RLock()
  607. // Exit if the database is not open yet.
  608. if !db.opened {
  609. db.mmaplock.RUnlock()
  610. db.metalock.Unlock()
  611. return nil, ErrDatabaseNotOpen
  612. }
  613. // Exit if the database is not correctly mapped.
  614. if db.data == nil {
  615. db.mmaplock.RUnlock()
  616. db.metalock.Unlock()
  617. return nil, ErrInvalidMapping
  618. }
  619. // Create a transaction associated with the database.
  620. t := &Tx{}
  621. t.init(db)
  622. // Keep track of transaction until it closes.
  623. db.txs = append(db.txs, t)
  624. n := len(db.txs)
  625. // Unlock the meta pages.
  626. db.metalock.Unlock()
  627. // Update the transaction stats.
  628. db.statlock.Lock()
  629. db.stats.TxN++
  630. db.stats.OpenTxN = n
  631. db.statlock.Unlock()
  632. return t, nil
  633. }
  634. func (db *DB) beginRWTx() (*Tx, error) {
  635. // If the database was opened with Options.ReadOnly, return an error.
  636. if db.readOnly {
  637. return nil, ErrDatabaseReadOnly
  638. }
  639. // Obtain writer lock. This is released by the transaction when it closes.
  640. // This enforces only one writer transaction at a time.
  641. db.rwlock.Lock()
  642. // Once we have the writer lock then we can lock the meta pages so that
  643. // we can set up the transaction.
  644. db.metalock.Lock()
  645. defer db.metalock.Unlock()
  646. // Exit if the database is not open yet.
  647. if !db.opened {
  648. db.rwlock.Unlock()
  649. return nil, ErrDatabaseNotOpen
  650. }
  651. // Exit if the database is not correctly mapped.
  652. if db.data == nil {
  653. db.rwlock.Unlock()
  654. return nil, ErrInvalidMapping
  655. }
  656. // Create a transaction associated with the database.
  657. t := &Tx{writable: true}
  658. t.init(db)
  659. db.rwtx = t
  660. db.freePages()
  661. return t, nil
  662. }
// freePages releases any pages associated with closed read-only transactions.
//
// It sorts db.txs in place by transaction id and then hands the freelist
// every txid range that no open read transaction falls into. It is called
// from beginRWTx while the meta lock is held.
func (db *DB) freePages() {
	// Free all pending pages prior to earliest open transaction.
	sort.Sort(txsById(db.txs))
	// With no open read transactions minid stays at the maximum txid.
	minid := txid(0xFFFFFFFFFFFFFFFF)
	if len(db.txs) > 0 {
		minid = db.txs[0].meta.txid
	}
	// Guard against wrapping around when computing minid - 1.
	if minid > 0 {
		db.freelist.release(minid - 1)
	}
	// Release unused txid extents.
	// Any page both allocated and freed in an extent is safe to release.
	// Each iteration covers the gap below the txid of t, then moves minid
	// just past that txid.
	for _, t := range db.txs {
		db.freelist.releaseRange(minid, t.meta.txid-1)
		minid = t.meta.txid + 1
	}
	// Finally the range above the newest open read transaction.
	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
}
  682. type txsById []*Tx
  683. func (t txsById) Len() int { return len(t) }
  684. func (t txsById) Swap(i, j int) { t[i], t[j] = t[j], t[i] }
  685. func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
  686. // removeTx removes a transaction from the database.
  687. func (db *DB) removeTx(tx *Tx) {
  688. // Release the read lock on the mmap.
  689. db.mmaplock.RUnlock()
  690. // Use the meta lock to restrict access to the DB object.
  691. db.metalock.Lock()
  692. // Remove the transaction.
  693. for i, t := range db.txs {
  694. if t == tx {
  695. last := len(db.txs) - 1
  696. db.txs[i] = db.txs[last]
  697. db.txs[last] = nil
  698. db.txs = db.txs[:last]
  699. break
  700. }
  701. }
  702. n := len(db.txs)
  703. // Unlock the meta pages.
  704. db.metalock.Unlock()
  705. // Merge statistics.
  706. db.statlock.Lock()
  707. db.stats.OpenTxN = n
  708. db.stats.TxStats.add(&tx.stats)
  709. db.statlock.Unlock()
  710. }
  711. // Update executes a function within the context of a read-write managed transaction.
  712. // If no error is returned from the function then the transaction is committed.
  713. // If an error is returned then the entire transaction is rolled back.
  714. // Any error that is returned from the function or returned from the commit is
  715. // returned from the Update() method.
  716. //
  717. // Attempting to manually commit or rollback within the function will cause a panic.
  718. func (db *DB) Update(fn func(*Tx) error) error {
  719. t, err := db.Begin(true)
  720. if err != nil {
  721. return err
  722. }
  723. // Make sure the transaction rolls back in the event of a panic.
  724. defer func() {
  725. if t.db != nil {
  726. t.rollback()
  727. }
  728. }()
  729. // Mark as a managed tx so that the inner function cannot manually commit.
  730. t.managed = true
  731. // If an error is returned from the function then rollback and return error.
  732. err = fn(t)
  733. t.managed = false
  734. if err != nil {
  735. _ = t.Rollback()
  736. return err
  737. }
  738. return t.Commit()
  739. }
  740. // View executes a function within the context of a managed read-only transaction.
  741. // Any error that is returned from the function is returned from the View() method.
  742. //
  743. // Attempting to manually rollback within the function will cause a panic.
  744. func (db *DB) View(fn func(*Tx) error) error {
  745. t, err := db.Begin(false)
  746. if err != nil {
  747. return err
  748. }
  749. // Make sure the transaction rolls back in the event of a panic.
  750. defer func() {
  751. if t.db != nil {
  752. t.rollback()
  753. }
  754. }()
  755. // Mark as a managed tx so that the inner function cannot manually rollback.
  756. t.managed = true
  757. // If an error is returned from the function then pass it through.
  758. err = fn(t)
  759. t.managed = false
  760. if err != nil {
  761. _ = t.Rollback()
  762. return err
  763. }
  764. return t.Rollback()
  765. }
  766. // Batch calls fn as part of a batch. It behaves similar to Update,
  767. // except:
  768. //
  769. // 1. concurrent Batch calls can be combined into a single Bolt
  770. // transaction.
  771. //
  772. // 2. the function passed to Batch may be called multiple times,
  773. // regardless of whether it returns error or not.
  774. //
  775. // This means that Batch function side effects must be idempotent and
  776. // take permanent effect only after a successful return is seen in
  777. // caller.
  778. //
  779. // The maximum batch size and delay can be adjusted with DB.MaxBatchSize
  780. // and DB.MaxBatchDelay, respectively.
  781. //
  782. // Batch is only useful when there are multiple goroutines calling it.
  783. func (db *DB) Batch(fn func(*Tx) error) error {
  784. errCh := make(chan error, 1)
  785. db.batchMu.Lock()
  786. if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
  787. // There is no existing batch, or the existing batch is full; start a new one.
  788. db.batch = &batch{
  789. db: db,
  790. }
  791. db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
  792. }
  793. db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
  794. if len(db.batch.calls) >= db.MaxBatchSize {
  795. // wake up batch, it's ready to run
  796. go db.batch.trigger()
  797. }
  798. db.batchMu.Unlock()
  799. err := <-errCh
  800. if err == trySolo {
  801. err = db.Update(fn)
  802. }
  803. return err
  804. }
// call is one pending DB.Batch invocation: the submitted function and the
// channel on which its outcome is reported back to the waiting caller.
type call struct {
	fn  func(*Tx) error // function passed to DB.Batch
	err chan<- error    // send-only side of the caller's result channel
}
// batch groups the calls submitted through DB.Batch that are executed together.
type batch struct {
	db    *DB         // database the batch runs against
	timer *time.Timer // created by DB.Batch to call trigger after DB.MaxBatchDelay
	start sync.Once   // guards run so the batch is executed at most once
	calls []call      // calls queued for this batch
}
// trigger runs the batch if it hasn't already been run.
// It goes through b.start (a sync.Once), so however many times trigger is
// called, run executes only once.
func (b *batch) trigger() {
	b.start.Do(b.run)
}
  819. // run performs the transactions in the batch and communicates results
  820. // back to DB.Batch.
  821. func (b *batch) run() {
  822. b.db.batchMu.Lock()
  823. b.timer.Stop()
  824. // Make sure no new work is added to this batch, but don't break
  825. // other batches.
  826. if b.db.batch == b {
  827. b.db.batch = nil
  828. }
  829. b.db.batchMu.Unlock()
  830. retry:
  831. for len(b.calls) > 0 {
  832. var failIdx = -1
  833. err := b.db.Update(func(tx *Tx) error {
  834. for i, c := range b.calls {
  835. if err := safelyCall(c.fn, tx); err != nil {
  836. failIdx = i
  837. return err
  838. }
  839. }
  840. return nil
  841. })
  842. if failIdx >= 0 {
  843. // take the failing transaction out of the batch. it's
  844. // safe to shorten b.calls here because db.batch no longer
  845. // points to us, and we hold the mutex anyway.
  846. c := b.calls[failIdx]
  847. b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
  848. // tell the submitter re-run it solo, continue with the rest of the batch
  849. c.err <- trySolo
  850. continue retry
  851. }
  852. // pass success, or bolt internal errors, to all callers
  853. for _, c := range b.calls {
  854. c.err <- err
  855. }
  856. break retry
  857. }
  858. }
// trySolo is a sentinel error used to signal that a batched transaction
// function failed and should be re-run on its own. DB.Batch intercepts it and
// retries via Update, so it should never be seen by callers.
var trySolo = errors.New("batch function returned an error and should be re-run solo")
  863. type panicked struct {
  864. reason interface{}
  865. }
  866. func (p panicked) Error() string {
  867. if err, ok := p.reason.(error); ok {
  868. return err.Error()
  869. }
  870. return fmt.Sprintf("panic: %v", p.reason)
  871. }
  872. func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
  873. defer func() {
  874. if p := recover(); p != nil {
  875. err = panicked{p}
  876. }
  877. }()
  878. return fn(tx)
  879. }
  880. // Sync executes fdatasync() against the database file handle.
  881. //
  882. // This is not necessary under normal operation, however, if you use NoSync
  883. // then it allows you to force the database file to sync against the disk.
  884. func (db *DB) Sync() error { return fdatasync(db) }
  885. // Stats retrieves ongoing performance stats for the database.
  886. // This is only updated when a transaction closes.
  887. func (db *DB) Stats() Stats {
  888. db.statlock.RLock()
  889. defer db.statlock.RUnlock()
  890. return db.stats
  891. }
  892. // This is for internal access to the raw data bytes from the C cursor, use
  893. // carefully, or not at all.
  894. func (db *DB) Info() *Info {
  895. _assert(db.data != nil, "database file isn't correctly mapped")
  896. return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
  897. }
  898. // page retrieves a page reference from the mmap based on the current page size.
  899. func (db *DB) page(id pgid) *page {
  900. pos := id * pgid(db.pageSize)
  901. return (*page)(unsafe.Pointer(&db.data[pos]))
  902. }
  903. // pageInBuffer retrieves a page reference from a given byte array based on the current page size.
  904. func (db *DB) pageInBuffer(b []byte, id pgid) *page {
  905. return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
  906. }
  907. // meta retrieves the current meta page reference.
  908. func (db *DB) meta() *meta {
  909. // We have to return the meta with the highest txid which doesn't fail
  910. // validation. Otherwise, we can cause errors when in fact the database is
  911. // in a consistent state. metaA is the one with the higher txid.
  912. metaA := db.meta0
  913. metaB := db.meta1
  914. if db.meta1.txid > db.meta0.txid {
  915. metaA = db.meta1
  916. metaB = db.meta0
  917. }
  918. // Use higher meta page if valid. Otherwise, fallback to previous, if valid.
  919. if err := metaA.validate(); err == nil {
  920. return metaA
  921. } else if err := metaB.validate(); err == nil {
  922. return metaB
  923. }
  924. // This should never be reached, because both meta1 and meta0 were validated
  925. // on mmap() and we do fsync() on every write.
  926. panic("bolt.DB.meta(): invalid meta pages")
  927. }
// allocate returns a page backed by a temporary in-memory buffer that spans
// count contiguous pages, with its id and overflow fields filled in.
//
// A single-page buffer is taken from db.pagePool; larger requests get a new
// slice of count*db.pageSize bytes. The page id comes from the freelist when
// it returns a non-zero id. Otherwise the pages are placed at the current high
// water mark (db.rwtx.meta.pgid), the mmap is resized first if needed, and the
// high water mark is advanced by count.
//
// An error is returned only when resizing the mmap fails.
func (db *DB) allocate(txid txid, count int) (*page, error) {
	// Allocate a temporary buffer for the page.
	var buf []byte
	if count == 1 {
		buf = db.pagePool.Get().([]byte)
	} else {
		buf = make([]byte, count*db.pageSize)
	}
	// The page header is overlaid on the start of the buffer.
	p := (*page)(unsafe.Pointer(&buf[0]))
	// overflow records the number of pages beyond the first.
	p.overflow = uint32(count - 1)

	// Use pages from the freelist if they are available.
	// A zero id from the freelist is treated as "nothing available".
	if p.id = db.freelist.allocate(txid, count); p.id != 0 {
		return p, nil
	}

	// Resize mmap() if we're at the end.
	p.id = db.rwtx.meta.pgid
	// Minimum mapping size: one page more than the end of the new allocation.
	var minsz = int((p.id+pgid(count))+1) * db.pageSize
	if minsz >= db.datasz {
		if err := db.mmap(minsz); err != nil {
			return nil, fmt.Errorf("mmap allocate error: %s", err)
		}
	}

	// Move the page id high water mark.
	db.rwtx.meta.pgid += pgid(count)

	return p, nil
}
  955. // grow grows the size of the database to the given sz.
  956. func (db *DB) grow(sz int) error {
  957. // Ignore if the new size is less than available file size.
  958. if sz <= db.filesz {
  959. return nil
  960. }
  961. // If the data is smaller than the alloc size then only allocate what's needed.
  962. // Once it goes over the allocation size then allocate in chunks.
  963. if db.datasz <= db.AllocSize {
  964. sz = db.datasz
  965. } else {
  966. sz += db.AllocSize
  967. }
  968. // Truncate and fsync to ensure file size metadata is flushed.
  969. // https://github.com/boltdb/bolt/issues/284
  970. if !db.NoGrowSync && !db.readOnly {
  971. if runtime.GOOS != "windows" {
  972. if err := db.file.Truncate(int64(sz)); err != nil {
  973. return fmt.Errorf("file resize error: %s", err)
  974. }
  975. }
  976. if err := db.file.Sync(); err != nil {
  977. return fmt.Errorf("file sync error: %s", err)
  978. }
  979. if db.Mlock {
  980. // unlock old file and lock new one
  981. if err := db.mrelock(db.filesz, sz); err != nil {
  982. return fmt.Errorf("mlock/munlock error: %s", err)
  983. }
  984. }
  985. }
  986. db.filesz = sz
  987. return nil
  988. }
// IsReadOnly reports whether the database's readOnly flag is set.
func (db *DB) IsReadOnly() bool {
	return db.readOnly
}
  992. func (db *DB) freepages() []pgid {
  993. tx, err := db.beginTx()
  994. defer func() {
  995. err = tx.Rollback()
  996. if err != nil {
  997. panic("freepages: failed to rollback tx")
  998. }
  999. }()
  1000. if err != nil {
  1001. panic("freepages: failed to open read only tx")
  1002. }
  1003. reachable := make(map[pgid]*page)
  1004. nofreed := make(map[pgid]bool)
  1005. ech := make(chan error)
  1006. go func() {
  1007. for e := range ech {
  1008. panic(fmt.Sprintf("freepages: failed to get all reachable pages (%v)", e))
  1009. }
  1010. }()
  1011. tx.checkBucket(&tx.root, reachable, nofreed, HexKVStringer(), ech)
  1012. close(ech)
  1013. // TODO: If check bucket reported any corruptions (ech) we shouldn't proceed to freeing the pages.
  1014. var fids []pgid
  1015. for i := pgid(2); i < db.meta().pgid; i++ {
  1016. if _, ok := reachable[i]; !ok {
  1017. fids = append(fids, i)
  1018. }
  1019. }
  1020. return fids
  1021. }
// Options represents the options that can be set when opening a database.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely. This option is only
	// available on Darwin and Linux.
	Timeout time.Duration

	// NoGrowSync sets the DB.NoGrowSync flag before memory mapping the file.
	NoGrowSync bool

	// NoFreelistSync disables syncing the freelist to disk. This improves the
	// database write performance under normal operation, but requires a full
	// database re-sync during recovery.
	NoFreelistSync bool

	// PreLoadFreelist sets whether to load the free pages when opening
	// the db file. Note when opening db in write mode, bbolt will always
	// load the free pages.
	PreLoadFreelist bool

	// FreelistType sets the backend freelist type. There are two options:
	// array, which is simple but endures dramatic performance degradation if
	// the database is large and fragmentation in the freelist is common; and
	// hashmap, which is faster in almost all circumstances but doesn't
	// guarantee that it offers the smallest page id available. In the normal
	// case it is safe. The default type is array.
	FreelistType FreelistType

	// ReadOnly opens the database in read-only mode. Uses
	// flock(..., LOCK_SH|LOCK_NB) to grab a shared lock (UNIX).
	ReadOnly bool

	// MmapFlags sets the DB.MmapFlags flag before memory mapping the file.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database
	// in bytes. Read transactions won't block write transaction
	// if the InitialMmapSize is large enough to hold database mmap
	// size. (See DB.Begin for more information)
	//
	// If <=0, the initial map size is 0.
	// If InitialMmapSize is smaller than the previous database size,
	// it takes no effect.
	InitialMmapSize int

	// PageSize overrides the default OS page size.
	PageSize int

	// NoSync sets the initial value of DB.NoSync. Normally this can just be
	// set directly on the DB itself when returned from Open(), but this option
	// is useful in APIs which expose Options but not the underlying DB.
	NoSync bool

	// OpenFile is used to open files. It defaults to os.OpenFile. This option
	// is useful for writing hermetic tests.
	OpenFile func(string, int, os.FileMode) (*os.File, error)

	// Mlock locks the database file in memory when set to true.
	// It prevents potential page faults; however, the used memory
	// can't be reclaimed. (UNIX only)
	Mlock bool
}
// DefaultOptions represent the options used if nil options are passed into Open().
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
// NoGrowSync is off and the array freelist type is selected; every other
// field is left at its zero value.
var DefaultOptions = &Options{
	Timeout:      0,
	NoGrowSync:   false,
	FreelistType: FreelistArrayType,
}
// Stats represents statistics about the database.
type Stats struct {
	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions

	TxStats TxStats // global, ongoing stats.
}
  1090. // Sub calculates and returns the difference between two sets of database stats.
  1091. // This is useful when obtaining stats at two different points and time and
  1092. // you need the performance counters that occurred within that time span.
  1093. func (s *Stats) Sub(other *Stats) Stats {
  1094. if other == nil {
  1095. return *s
  1096. }
  1097. var diff Stats
  1098. diff.FreePageN = s.FreePageN
  1099. diff.PendingPageN = s.PendingPageN
  1100. diff.FreeAlloc = s.FreeAlloc
  1101. diff.FreelistInuse = s.FreelistInuse
  1102. diff.TxN = s.TxN - other.TxN
  1103. diff.TxStats = s.TxStats.Sub(&other.TxStats)
  1104. return diff
  1105. }
// Info describes the raw mapped data as returned by DB.Info.
type Info struct {
	Data     uintptr // address of the first byte of the database's data
	PageSize int     // page size of the database
}
// meta is the content of a meta page.
//
// The field order matters: sum64 hashes the raw bytes of every field that
// precedes checksum, so checksum must remain the last field.
type meta struct {
	magic    uint32 // must equal the package's magic constant to validate
	version  uint32 // must equal the package's version constant to validate
	pageSize uint32
	flags    uint32
	root     bucket // root bucket; its root page id must be below pgid
	freelist pgid   // freelist page id; below pgid unless it is pgidNoFreelist
	pgid     pgid   // page id high water mark
	txid     txid   // transaction id; txid%2 selects which meta page is written
	checksum uint64 // sum64 over the fields above
}
  1121. // validate checks the marker bytes and version of the meta page to ensure it matches this binary.
  1122. func (m *meta) validate() error {
  1123. if m.magic != magic {
  1124. return ErrInvalid
  1125. } else if m.version != version {
  1126. return ErrVersionMismatch
  1127. } else if m.checksum != m.sum64() {
  1128. return ErrChecksum
  1129. }
  1130. return nil
  1131. }
// copy copies one meta object to another: every field of m is assigned over
// the meta that dest points to.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
  1136. // write writes the meta onto a page.
  1137. func (m *meta) write(p *page) {
  1138. if m.root.root >= m.pgid {
  1139. panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
  1140. } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
  1141. // TODO: reject pgidNoFreeList if !NoFreelistSync
  1142. panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
  1143. }
  1144. // Page id is either going to be 0 or 1 which we can determine by the transaction ID.
  1145. p.id = pgid(m.txid % 2)
  1146. p.flags |= metaPageFlag
  1147. // Calculate the checksum.
  1148. m.checksum = m.sum64()
  1149. m.copy(p.meta())
  1150. }
  1151. // generates the checksum for the meta.
  1152. func (m *meta) sum64() uint64 {
  1153. var h = fnv.New64a()
  1154. _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
  1155. return h.Sum64()
  1156. }
  1157. // _assert will panic with a given formatted message if the given condition is false.
  1158. func _assert(condition bool, msg string, v ...interface{}) {
  1159. if !condition {
  1160. panic(fmt.Sprintf("assertion failed: "+msg, v...))
  1161. }
  1162. }