- commit
- 526b2fe
- parent
- 166cd97
- author
- Evgenii Akentev
- date
- 2023-07-21 07:17:31 +0400 +04
Add benchmark + array based implementation
4 files changed,
+271,
-72
+54,
-0
1@@ -0,0 +1,54 @@
2+module Main (main) where
3+
4+import Criterion.Main
5+import System.IO
6+import System.Random (randomRs, mkStdGen)
7+
8+import System.IO.LineIndexedCursor
9+
10+main :: IO ()
11+main = do
12+ -- benchmarked against http://mattmahoney.net/dc/enwik9.zip
13+ h <- openFile "test/enwik9" ReadMode
14+ c <- mkLineIndexedCursor h
15+ defaultMain
16+ [ bgroup "LineIndexedCursor"
17+ [ bench "" (nfIO $ goToLinesBench c)
18+ ]
19+ ]
20+
21+goToLinesBench :: LineIndexedCursor -> IO ()
22+goToLinesBench c = do
23+ mapM_ (goToLine c) randomOffsets
24+
25+randomOffsets :: [Integer]
26+randomOffsets = take 100 $
27+ randomRs (0, 13147025) (mkStdGen 343)
28+{-# NOINLINE randomOffsets #-}
29+
30+
31+{-
32+Array based results
33+
34+benchmarking LineIndexedCursor/
35+time 115.8 μs (115.3 μs .. 116.4 μs)
36+ 1.000 R² (1.000 R² .. 1.000 R²)
37+mean 116.4 μs (115.9 μs .. 117.0 μs)
38+std dev 1.782 μs (1.510 μs .. 2.271 μs)
39+
40+Benchmark bench: FINISH
41+-}
42+
43+
44+{-
45+List based results
46+
47+benchmarking LineIndexedCursor/
48+time 15.13 s (9.490 s .. 22.09 s)
49+ 0.976 R² (0.926 R² .. 1.000 R²)
50+mean 20.05 s (17.70 s .. 23.47 s)
51+std dev 3.576 s (624.5 ms .. 4.685 s)
52+variance introduced by outliers: 47% (moderately inflated)
53+
54+Benchmark bench: FINISH
55+-}
+17,
-1
1@@ -19,7 +19,7 @@ common warnings
2 library
3 import: warnings
4 exposed-modules: System.IO.LineIndexedCursor
5- build-depends: base ^>=4.18.0.0, bytestring ^>= 0.11
6+ build-depends: base >= 4.7 && < 5, bytestring ^>= 0.11, array
7 hs-source-dirs: src
8 default-language: Haskell2010
9
10@@ -33,3 +33,19 @@ test-suite line-indexed-cursor-test
11 base >= 4.7 && < 5,
12 hspec >= 2.10 && < 3,
13 line-indexed-cursor
14+
15+benchmark bench
16+ import: warnings
17+ ghc-options:
18+ -threaded
19+ -rtsopts
20+ -with-rtsopts=-N
21+ default-language: Haskell2010
22+ type: exitcode-stdio-1.0
23+ hs-source-dirs: bench
24+ main-is: Bench.hs
25+ build-depends:
26+ base >= 4.7 && < 5,
27+ criterion,
28+ random,
29+ line-indexed-cursor
+111,
-17
1@@ -13,13 +13,17 @@
2 -----------------------------------------------------------------------------
3
4 module System.IO.LineIndexedCursor (
5- LineIndexedCursor(..), mkLineIndexedCursor
6+ LineIndexedCursor(..), mkLineIndexedCursor, mkLineIndexedCursorWithCapacity
7 ) where
8
9+import qualified Data.Array as A
10 import Data.ByteString (ByteString, hGetLine)
11 import Control.Concurrent.MVar
12 import System.IO (Handle, hTell, hSeek, SeekMode(..), hIsEOF)
13
14+defaultListCapacity :: Integer
15+defaultListCapacity = 16384
16+
17 -- | ADT with methods, hiding the internal state.
18 data LineIndexedCursor = LineIndexedCursor
19 {
20@@ -29,6 +33,9 @@ data LineIndexedCursor = LineIndexedCursor
21 -- | Returns current line number.
22 , getCurrentLineNumber :: IO Integer
23
24+ -- | Reads from the latest line from index until EOF to build the full index.
25+ , doFullScan :: IO ()
26+
27 -- | Rewinds to the requsted line number. Stops at EOF if it's too big.
28 -- Returns the reached line number.
29 , goToLine :: Integer -> IO Integer
30@@ -39,9 +46,20 @@ data LineIndexedCursor = LineIndexedCursor
31
32 data CursorHandle = CursorHandle
33 { fileHandle :: Handle
34- , linesIdx :: MVar ([Integer], Integer, Integer)
35+ , cursorState :: MVar CursorState
36+ , listCapacity :: Integer
37+ }
38+
39+data CursorState = CursorState
40+ { cursorLinesIdx :: ![Integer]
41+ , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer))
42+ , cursorIdxSize :: !Integer
43+ , cursorCurrentLineNumber :: !Integer
44 }
45
46+mElems :: (Maybe (A.Array Integer Integer)) -> [Integer]
47+mElems = maybe [] A.elems
48+
49 {- |
50
51 Builds 'LineIndexedCursor'.
52@@ -52,16 +70,20 @@ Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'
53
54 -}
55 mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
56-mkLineIndexedCursor fileHandle = do
57+mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
58+
59+mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
60+mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
61 -- reset the handle's offset to the beginning
62 hSeek fileHandle AbsoluteSeek 0
63
64- linesIdx <- newMVar ([0], 0, 0)
65+ cursorState <- newMVar $ CursorState [0] Nothing 0 0
66
67- let cursorHandle = CursorHandle fileHandle linesIdx
68+ let cursorHandle = CursorHandle fileHandle cursorState listCapacity
69 pure $ LineIndexedCursor
70 { getCurrentLine = getCurrentLine' cursorHandle
71 , getCurrentLineNumber = getCurrentLineNumber' cursorHandle
72+ , doFullScan = doFullScan' cursorHandle
73 , goToLine = goToLine' cursorHandle
74 , getHandle = fileHandle
75 }
76@@ -71,35 +93,107 @@ getCurrentLine' CursorHandle{..} =
77 hIsEOF fileHandle >>= \isEOF -> if isEOF then pure Nothing else do
78 line <- hGetLine fileHandle
79 offset <- hTell fileHandle
80- modifyMVar_ linesIdx $ \(idx, size, cln) -> pure $
81+
82+ modifyMVar_ cursorState $ \(CursorState idx arr size cln) -> pure $
83 if (not $ offset `elem` idx)
84- then (offset : idx, size + 1, cln + 1)
85- else (idx, size, cln + 1)
86+ then
87+ let
88+ (newIdx, newArr) =
89+ if length (offset : idx) > fromIntegral listCapacity
90+ then
91+ let res = (offset : idx) ++ mElems arr
92+ in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
93+ else (offset : idx, arr)
94+ in CursorState
95+ { cursorLinesIdx = newIdx
96+ , cursorLinesArrIdx = newArr
97+ , cursorIdxSize = size + 1
98+ , cursorCurrentLineNumber = cln + 1
99+ }
100+ else CursorState
101+ { cursorLinesIdx = idx
102+ , cursorLinesArrIdx = arr
103+ , cursorIdxSize = size
104+ , cursorCurrentLineNumber = cln + 1
105+ }
106 pure $ Just line
107
108+doFullScan' :: CursorHandle -> IO ()
109+doFullScan' CursorHandle{..} = do
110+ modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
111+ -- go to the end of the index
112+ hSeek fileHandle AbsoluteSeek (getFirst cs)
113+ -- try to read until the EOF
114+ idxTail <- readUntilEOF []
115+ let
116+ newSize = size + (fromIntegral $ length idxTail)
117+ newState = CursorState
118+ { cursorLinesIdx = idxTail ++ idx
119+ , cursorLinesArrIdx = arr
120+ , cursorIdxSize = newSize
121+ , cursorCurrentLineNumber = newSize
122+ }
123+ pure newState
124+ where
125+ readUntilEOF idx =
126+ hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
127+ _ <- hGetLine fileHandle
128+ offset <- hTell fileHandle
129+ readUntilEOF (fromInteger offset : idx)
130+
131 getCurrentLineNumber' :: CursorHandle -> IO Integer
132 getCurrentLineNumber' CursorHandle{..} = do
133- (_, _, cln) <- readMVar linesIdx
134+ CursorState _ _ _ cln <- readMVar cursorState
135 pure cln
136
137 goToLine' :: CursorHandle -> Integer -> IO Integer
138 goToLine' ch@CursorHandle{..} ln =
139 if (ln < 0) then getCurrentLineNumber' ch
140- else modifyMVar linesIdx $ \(idx, size, _) -> do
141+ else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
142 if ln > size then do
143- hSeek fileHandle AbsoluteSeek (idx !! 0)
144+ hSeek fileHandle AbsoluteSeek (getFirst cs)
145 -- try to read until the requested line number
146 idxTail <- readUntil (ln - size) []
147- let newSize = size + (fromIntegral $ length idxTail)
148- pure ((idxTail ++ idx, newSize, newSize), newSize)
149+ let
150+ newSize = size + (fromIntegral $ length idxTail)
151+ (newIdx, newArr) =
152+ if newSize > listCapacity
153+ then
154+ let res = (idxTail ++ idx) ++ mElems arr
155+ in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
156+ else (idxTail ++ idx, arr)
157+ newState = CursorState
158+ { cursorLinesIdx = newIdx
159+ , cursorLinesArrIdx = newArr
160+ , cursorIdxSize = newSize
161+ , cursorCurrentLineNumber = newSize
162+ }
163+ pure (newState, newSize)
164 else do
165- let nextSeekIndex = fromIntegral $ size - ln
166- hSeek fileHandle AbsoluteSeek (idx !! nextSeekIndex)
167- pure ((idx, size, ln), ln)
168+ let nextSeekIndex = size - ln
169+
170+ if nextSeekIndex >= fromIntegral (length idx)
171+ then case arr of
172+ Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
173+ Nothing -> error "goToLine: there is no array"
174+ else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
175+
176+ let
177+ newState = CursorState
178+ { cursorLinesIdx = idx
179+ , cursorLinesArrIdx = arr
180+ , cursorIdxSize = size
181+ , cursorCurrentLineNumber = ln
182+ }
183+ pure (newState, ln)
184 where
185 readUntil 0 idx = pure idx
186 readUntil counter idx =
187 hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
188 _ <- hGetLine fileHandle
189 offset <- hTell fileHandle
190- readUntil (counter - 1) (fromInteger offset : idx)
191+ readUntil (counter - 1) (fromInteger offset : idx)
192+
193+getFirst :: CursorState -> Integer
194+getFirst (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
195+getFirst (CursorState idx Nothing _ _) = idx !! 0
+89,
-54
1@@ -4,87 +4,122 @@ module Main (main) where
2
3 import System.IO
4 import System.IO.LineIndexedCursor
5+import Data.Foldable
6
7 import Test.Hspec
8
9 main :: IO ()
10 main = hspec $ do
11 let
12- mkCursor = do
13+ mkCursor capacity = do
14 h <- openFile "test/testdata" ReadMode
15- c <- mkLineIndexedCursor h
16+ c <- mkLineIndexedCursorWithCapacity h capacity
17 pure (h, c)
18
19- before mkCursor . after (\(h, _) -> hClose h)
20- $ describe "System.IO.LineIndexedCursor" $ do
21+ forM_ [0 :: Integer ..20] $ \capacity -> do
22+ before (mkCursor capacity) . after (\(h, _) -> hClose h)
23+ $ describe ("System.IO.LineIndexedCursor with list capacity " ++ show capacity) $ do
24
25- it "getCurrentLine works" $ \(_, c) -> do
26- l <- getCurrentLine c
27- l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
28+ it "getCurrentLine works" $ \(_, c) -> do
29+ l <- getCurrentLine c
30+ l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
31
32- it "goToLine works" $ \(_, c) -> do
33- ln <- goToLine c 10
34- ln `shouldBe` 10
35+ ln <- goToLine c 3
36+ ln `shouldBe` 3
37
38- l <- getCurrentLine c
39- l `shouldBe` Just "Sed elementum velit sit amet orci mollis tincidunt."
40+ l' <- getCurrentLine c
41+ l' `shouldBe` Just "Curabitur nec mi sit amet justo condimentum gravida."
42
43- it "goToLine is negative" $ \(_, c) -> do
44- ln <- goToLine c (-10)
45- ln `shouldBe` 0
46+ l'' <- getCurrentLine c
47+ l'' `shouldBe` Just "Pellentesque accumsan dolor at nisl pulvinar, ut bibendum diam egestas."
48
49- l <- getCurrentLine c
50- l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
51+ ln' <- goToLine c 3
52+ ln' `shouldBe` 3
53
54- it "goToLine is too big" $ \(_, c) -> do
55- l <- getCurrentLine c
56- l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
57+ ln'' <- goToLine c 2
58+ ln'' `shouldBe` 2
59
60- ln <- goToLine c 30
61- ln `shouldBe` 20
62+ ln''' <- goToLine c 1
63+ ln''' `shouldBe` 1
64
65- l' <- getCurrentLine c
66- l' `shouldBe` Nothing
67+ ln'''' <- goToLine c 0
68+ ln'''' `shouldBe` 0
69
70- it "read line, then go to beginning and forth" $ \(_, c) -> do
71- cln <- getCurrentLineNumber c
72- cln `shouldBe` 0
73+ it "goToLine works" $ \(_, c) -> do
74+ ln <- goToLine c 10
75+ ln `shouldBe` 10
76
77- l <- getCurrentLine c
78- l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
79+ l <- getCurrentLine c
80+ l `shouldBe` Just "Sed elementum velit sit amet orci mollis tincidunt."
81
82- cln' <- getCurrentLineNumber c
83- cln' `shouldBe` 1
84+ it "goToLine is negative" $ \(_, c) -> do
85+ ln <- goToLine c (-10)
86+ ln `shouldBe` 0
87
88- _ <- getCurrentLine c
89- _ <- getCurrentLine c
90- _ <- getCurrentLine c
91- _ <- getCurrentLine c
92- _ <- getCurrentLine c
93+ l <- getCurrentLine c
94+ l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
95
96- cln'' <- getCurrentLineNumber c
97- cln'' `shouldBe` 6
98+ it "goToLine is too big" $ \(_, c) -> do
99+ l <- getCurrentLine c
100+ l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
101
102- ln <- goToLine c 0
103- ln `shouldBe` 0
104+ ln <- goToLine c 30
105+ ln `shouldBe` 20
106
107- cln''' <- getCurrentLineNumber c
108- cln''' `shouldBe` 0
109+ l' <- getCurrentLine c
110+ l' `shouldBe` Nothing
111
112- l' <- getCurrentLine c
113- l' `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
114+ it "fullScan works" $ \(_, c) -> do
115+ l <- getCurrentLine c
116+ l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
117
118- ln' <- goToLine c 5
119- ln' `shouldBe` 5
120+ doFullScan c
121
122- l'' <- getCurrentLine c
123- l'' `shouldBe` Just "Curabitur nec dui posuere, tincidunt turpis vitae, tincidunt magna."
124+ cln <- getCurrentLineNumber c
125+ cln `shouldBe` 20
126
127- ln'' <- goToLine c 6
128- ln'' `shouldBe` 6
129+ l' <- getCurrentLine c
130+ l' `shouldBe` Nothing
131
132- ln''' <- goToLine c 7
133- ln''' `shouldBe` 7
134+ it "read line, then go to beginning and forth" $ \(_, c) -> do
135+ cln <- getCurrentLineNumber c
136+ cln `shouldBe` 0
137
138- ln'''' <- goToLine c 10
139- ln'''' `shouldBe` 10
140+ l <- getCurrentLine c
141+ l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
142+
143+ cln' <- getCurrentLineNumber c
144+ cln' `shouldBe` 1
145+
146+ _ <- getCurrentLine c
147+ _ <- getCurrentLine c
148+ _ <- getCurrentLine c
149+ _ <- getCurrentLine c
150+ _ <- getCurrentLine c
151+
152+ cln'' <- getCurrentLineNumber c
153+ cln'' `shouldBe` 6
154+
155+ ln <- goToLine c 0
156+ ln `shouldBe` 0
157+
158+ cln''' <- getCurrentLineNumber c
159+ cln''' `shouldBe` 0
160+
161+ l' <- getCurrentLine c
162+ l' `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
163+
164+ ln' <- goToLine c 5
165+ ln' `shouldBe` 5
166+
167+ l'' <- getCurrentLine c
168+ l'' `shouldBe` Just "Curabitur nec dui posuere, tincidunt turpis vitae, tincidunt magna."
169+
170+ ln'' <- goToLine c 6
171+ ln'' `shouldBe` 6
172+
173+ ln''' <- goToLine c 7
174+ ln''' `shouldBe` 7
175+
176+ ln'''' <- goToLine c 10
177+ ln'''' `shouldBe` 10