repos / line-indexed-cursor.hs.git


commit
526b2fe
parent
166cd97
author
Evgenii Akentev
date
2023-07-21 07:17:31 +0400 +04
Add benchmark + array based implementation
4 files changed,  +271, -72
A bench/Bench.hs
+54, -0
 1@@ -0,0 +1,54 @@
 2+module Main (main) where
 3+
 4+import Criterion.Main
 5+import System.IO
 6+import System.Random (randomRs, mkStdGen)
 7+
 8+import System.IO.LineIndexedCursor
 9+
10+main :: IO ()
11+main = do
12+  -- benchmarked against http://mattmahoney.net/dc/enwik9.zip
13+  h <- openFile "test/enwik9" ReadMode
14+  c <- mkLineIndexedCursor h
15+  defaultMain
16+    [ bgroup "LineIndexedCursor"
17+      [ bench "" (nfIO $ goToLinesBench c)
18+      ]
19+    ]
20+
21+goToLinesBench :: LineIndexedCursor -> IO ()
22+goToLinesBench c = do
23+  mapM_ (goToLine c) randomOffsets
24+
25+randomOffsets :: [Integer]
26+randomOffsets = take 100 $
27+  randomRs (0, 13147025) (mkStdGen 343)
28+{-# NOINLINE randomOffsets #-}
29+
30+
31+{-
32+Array based results
33+
34+benchmarking LineIndexedCursor/
35+time                 115.8 μs   (115.3 μs .. 116.4 μs)
36+                     1.000 R²   (1.000 R² .. 1.000 R²)
37+mean                 116.4 μs   (115.9 μs .. 117.0 μs)
38+std dev              1.782 μs   (1.510 μs .. 2.271 μs)
39+
40+Benchmark bench: FINISH
41+-}
42+
43+
44+{-
45+List based results
46+
47+benchmarking LineIndexedCursor/
48+time                 15.13 s    (9.490 s .. 22.09 s)
49+                     0.976 R²   (0.926 R² .. 1.000 R²)
50+mean                 20.05 s    (17.70 s .. 23.47 s)
51+std dev              3.576 s    (624.5 ms .. 4.685 s)
52+variance introduced by outliers: 47% (moderately inflated)
53+
54+Benchmark bench: FINISH
55+-}
M line-indexed-cursor.cabal
+17, -1
 1@@ -19,7 +19,7 @@ common warnings
 2 library
 3     import:           warnings
 4     exposed-modules:  System.IO.LineIndexedCursor
 5-    build-depends:    base ^>=4.18.0.0, bytestring ^>= 0.11
 6+    build-depends:    base >= 4.7 && < 5, bytestring ^>= 0.11, array
 7     hs-source-dirs:   src
 8     default-language: Haskell2010
 9 
10@@ -33,3 +33,19 @@ test-suite line-indexed-cursor-test
11         base >= 4.7 && < 5,
12         hspec >= 2.10 && < 3,
13         line-indexed-cursor
14+
15+benchmark bench
16+    import:           warnings
17+    ghc-options:
18+        -threaded
19+        -rtsopts
20+        -with-rtsopts=-N
21+    default-language: Haskell2010
22+    type:             exitcode-stdio-1.0
23+    hs-source-dirs:   bench
24+    main-is: Bench.hs
25+    build-depends:
26+        base >= 4.7 && < 5,
27+        criterion,
28+        random,
29+        line-indexed-cursor
M src/System/IO/LineIndexedCursor.hs
+111, -17
  1@@ -13,13 +13,17 @@
  2 -----------------------------------------------------------------------------
  3 
  4 module System.IO.LineIndexedCursor (
  5-  LineIndexedCursor(..), mkLineIndexedCursor
  6+  LineIndexedCursor(..), mkLineIndexedCursor, mkLineIndexedCursorWithCapacity
  7   ) where
  8 
  9+import qualified Data.Array as A
 10 import Data.ByteString (ByteString, hGetLine)
 11 import Control.Concurrent.MVar
 12 import System.IO (Handle, hTell, hSeek, SeekMode(..), hIsEOF)
 13 
 14+defaultListCapacity :: Integer
 15+defaultListCapacity = 16384
 16+
 17 -- | ADT with methods, hiding the internal state.
 18 data LineIndexedCursor = LineIndexedCursor
 19   {
 20@@ -29,6 +33,9 @@ data LineIndexedCursor = LineIndexedCursor
 21   -- | Returns current line number.
 22   , getCurrentLineNumber :: IO Integer
 23 
 24+  -- | Reads from the latest line from index until EOF to build the full index.
 25+  , doFullScan :: IO ()
 26+
 27   -- | Rewinds to the requsted line number. Stops at EOF if it's too big.
 28   -- Returns the reached line number.
 29   , goToLine :: Integer -> IO Integer
 30@@ -39,9 +46,20 @@ data LineIndexedCursor = LineIndexedCursor
 31 
 32 data CursorHandle = CursorHandle
 33   { fileHandle :: Handle
 34-  , linesIdx :: MVar ([Integer], Integer, Integer)
 35+  , cursorState :: MVar CursorState
 36+  , listCapacity :: Integer
 37+  }
 38+
 39+data CursorState = CursorState
 40+  { cursorLinesIdx :: ![Integer]
 41+  , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer))
 42+  , cursorIdxSize :: !Integer
 43+  , cursorCurrentLineNumber :: !Integer
 44   }
 45 
 46+mElems :: (Maybe (A.Array Integer Integer)) -> [Integer]
 47+mElems = maybe [] A.elems
 48+
 49 {- |
 50 
 51 Builds 'LineIndexedCursor'.
 52@@ -52,16 +70,20 @@ Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'
 53 
 54 -}
 55 mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
 56-mkLineIndexedCursor fileHandle = do
 57+mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
 58+
 59+mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
 60+mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
 61   -- reset the handle's offset to the beginning
 62   hSeek fileHandle AbsoluteSeek 0
 63 
 64-  linesIdx <- newMVar ([0], 0, 0)
 65+  cursorState <- newMVar $ CursorState [0] Nothing 0 0
 66 
 67-  let cursorHandle = CursorHandle fileHandle linesIdx
 68+  let cursorHandle = CursorHandle fileHandle cursorState listCapacity
 69   pure $ LineIndexedCursor
 70     { getCurrentLine = getCurrentLine' cursorHandle
 71     , getCurrentLineNumber = getCurrentLineNumber' cursorHandle
 72+    , doFullScan = doFullScan' cursorHandle
 73     , goToLine = goToLine' cursorHandle
 74     , getHandle = fileHandle
 75     }
 76@@ -71,35 +93,107 @@ getCurrentLine' CursorHandle{..} =
 77   hIsEOF fileHandle >>= \isEOF -> if isEOF then pure Nothing else do
 78     line <- hGetLine fileHandle
 79     offset <- hTell fileHandle
 80-    modifyMVar_ linesIdx $ \(idx, size, cln) -> pure $
 81+
 82+    modifyMVar_ cursorState $ \(CursorState idx arr size cln) -> pure $
 83       if (not $ offset `elem` idx)
 84-      then (offset : idx, size + 1, cln + 1)
 85-      else (idx, size, cln + 1)
 86+      then
 87+        let
 88+          (newIdx, newArr) =
 89+            if length (offset : idx) > fromIntegral listCapacity
 90+            then
 91+              let res = (offset : idx) ++ mElems arr
 92+              in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
 93+            else (offset : idx, arr)
 94+        in CursorState
 95+        { cursorLinesIdx = newIdx
 96+        , cursorLinesArrIdx = newArr
 97+        , cursorIdxSize = size + 1
 98+        , cursorCurrentLineNumber = cln + 1
 99+        }
100+      else CursorState
101+        { cursorLinesIdx = idx
102+        , cursorLinesArrIdx = arr
103+        , cursorIdxSize = size
104+        , cursorCurrentLineNumber = cln + 1
105+        }
106     pure $ Just line
107 
108+doFullScan' :: CursorHandle -> IO ()
109+doFullScan' CursorHandle{..} = do
110+  modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
111+    -- go to the end of the index
112+    hSeek fileHandle AbsoluteSeek (getFirst cs)
113+    -- try to read until the EOF
114+    idxTail <- readUntilEOF []
115+    let
116+      newSize = size + (fromIntegral $ length idxTail)
117+      newState = CursorState
118+          { cursorLinesIdx = idxTail ++ idx
119+          , cursorLinesArrIdx = arr
120+          , cursorIdxSize = newSize
121+          , cursorCurrentLineNumber = newSize
122+          }
123+    pure newState
124+  where
125+    readUntilEOF idx =
126+      hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
127+        _ <- hGetLine fileHandle
128+        offset <- hTell fileHandle
129+        readUntilEOF (fromInteger offset : idx)
130+
131 getCurrentLineNumber' :: CursorHandle -> IO Integer
132 getCurrentLineNumber' CursorHandle{..} = do
133-  (_, _, cln) <- readMVar linesIdx
134+  CursorState _ _ _ cln <- readMVar cursorState
135   pure cln
136 
137 goToLine' :: CursorHandle -> Integer -> IO Integer
138 goToLine' ch@CursorHandle{..} ln =
139   if (ln < 0) then getCurrentLineNumber' ch
140-  else modifyMVar linesIdx $ \(idx, size, _) -> do
141+  else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
142     if ln > size then do
143-      hSeek fileHandle AbsoluteSeek (idx !! 0)
144+      hSeek fileHandle AbsoluteSeek (getFirst cs)
145       -- try to read until the requested line number
146       idxTail <- readUntil (ln - size) []
147-      let newSize = size + (fromIntegral $ length idxTail)
148-      pure ((idxTail ++ idx, newSize, newSize), newSize)
149+      let
150+        newSize = size + (fromIntegral $ length idxTail)
151+        (newIdx, newArr) =
152+            if newSize > listCapacity
153+            then
154+              let res = (idxTail ++ idx) ++ mElems arr
155+              in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
156+            else (idxTail ++ idx, arr)
157+        newState = CursorState
158+          { cursorLinesIdx = newIdx
159+          , cursorLinesArrIdx = newArr
160+          , cursorIdxSize = newSize
161+          , cursorCurrentLineNumber = newSize
162+          }
163+      pure (newState, newSize)
164     else do
165-      let nextSeekIndex = fromIntegral $ size - ln
166-      hSeek fileHandle AbsoluteSeek (idx !! nextSeekIndex)
167-      pure ((idx, size, ln), ln)
168+      let nextSeekIndex = size - ln
169+
170+      if nextSeekIndex >= fromIntegral (length idx)
171+      then case arr of
172+        Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
173+        Nothing -> error "goToLine: there is no array"
174+      else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
175+
176+      let
177+        newState = CursorState
178+          { cursorLinesIdx = idx
179+          , cursorLinesArrIdx = arr
180+          , cursorIdxSize = size
181+          , cursorCurrentLineNumber = ln
182+          }
183+      pure (newState, ln)
184   where
185     readUntil 0 idx = pure idx
186     readUntil counter idx =
187       hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
188         _ <- hGetLine fileHandle
189         offset <- hTell fileHandle
190-        readUntil (counter - 1) (fromInteger offset : idx)
191+        readUntil (counter - 1) (fromInteger offset : idx)
192+
193+getFirst :: CursorState -> Integer
194+getFirst (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
195+getFirst (CursorState idx Nothing _ _) = idx !! 0
M test/Main.hs
+89, -54
  1@@ -4,87 +4,122 @@ module Main (main) where
  2 
  3 import System.IO
  4 import System.IO.LineIndexedCursor
  5+import Data.Foldable
  6 
  7 import Test.Hspec
  8 
  9 main :: IO ()
 10 main = hspec $ do
 11   let
 12-    mkCursor = do
 13+    mkCursor capacity = do
 14       h <- openFile "test/testdata" ReadMode
 15-      c <- mkLineIndexedCursor h
 16+      c <- mkLineIndexedCursorWithCapacity h capacity
 17       pure (h, c)
 18 
 19-  before mkCursor . after (\(h, _) -> hClose h)
 20-  $ describe "System.IO.LineIndexedCursor" $ do
 21+  forM_ [0 :: Integer ..20] $ \capacity -> do
 22+    before (mkCursor capacity) . after (\(h, _) -> hClose h)
 23+    $ describe ("System.IO.LineIndexedCursor with list capacity " ++ show capacity) $ do
 24 
 25-      it "getCurrentLine works" $ \(_, c) -> do
 26-        l <- getCurrentLine c
 27-        l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
 28+        it "getCurrentLine works" $ \(_, c) -> do
 29+          l <- getCurrentLine c
 30+          l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
 31 
 32-      it "goToLine works" $ \(_, c) -> do
 33-        ln <- goToLine c 10
 34-        ln `shouldBe` 10
 35+          ln <- goToLine c 3
 36+          ln `shouldBe` 3
 37 
 38-        l <- getCurrentLine c
 39-        l `shouldBe` Just "Sed elementum velit sit amet orci mollis tincidunt."
 40+          l' <- getCurrentLine c
 41+          l' `shouldBe` Just "Curabitur nec mi sit amet justo condimentum gravida."
 42 
 43-      it "goToLine is negative" $ \(_, c) -> do
 44-        ln <- goToLine c (-10)
 45-        ln `shouldBe` 0
 46+          l'' <- getCurrentLine c
 47+          l'' `shouldBe` Just "Pellentesque accumsan dolor at nisl pulvinar, ut bibendum diam egestas."
 48 
 49-        l <- getCurrentLine c
 50-        l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
 51+          ln' <- goToLine c 3
 52+          ln' `shouldBe` 3
 53 
 54-      it "goToLine is too big" $ \(_, c) -> do
 55-        l <- getCurrentLine c
 56-        l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
 57+          ln'' <- goToLine c 2
 58+          ln'' `shouldBe` 2
 59 
 60-        ln <- goToLine c 30
 61-        ln `shouldBe` 20
 62+          ln''' <- goToLine c 1
 63+          ln''' `shouldBe` 1
 64 
 65-        l' <- getCurrentLine c
 66-        l' `shouldBe` Nothing
 67+          ln'''' <- goToLine c 0
 68+          ln'''' `shouldBe` 0
 69 
 70-      it "read line, then go to beginning and forth" $ \(_, c) -> do
 71-        cln <- getCurrentLineNumber c
 72-        cln `shouldBe` 0
 73+        it "goToLine works" $ \(_, c) -> do
 74+          ln <- goToLine c 10
 75+          ln `shouldBe` 10
 76 
 77-        l <- getCurrentLine c
 78-        l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
 79+          l <- getCurrentLine c
 80+          l `shouldBe` Just "Sed elementum velit sit amet orci mollis tincidunt."
 81 
 82-        cln' <- getCurrentLineNumber c
 83-        cln' `shouldBe` 1
 84+        it "goToLine is negative" $ \(_, c) -> do
 85+          ln <- goToLine c (-10)
 86+          ln `shouldBe` 0
 87 
 88-        _ <- getCurrentLine c
 89-        _ <- getCurrentLine c
 90-        _ <- getCurrentLine c
 91-        _ <- getCurrentLine c
 92-        _ <- getCurrentLine c
 93+          l <- getCurrentLine c
 94+          l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
 95 
 96-        cln'' <- getCurrentLineNumber c
 97-        cln'' `shouldBe` 6
 98+        it "goToLine is too big" $ \(_, c) -> do
 99+          l <- getCurrentLine c
100+          l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
101 
102-        ln <- goToLine c 0
103-        ln `shouldBe` 0
104+          ln <- goToLine c 30
105+          ln `shouldBe` 20
106 
107-        cln''' <- getCurrentLineNumber c
108-        cln''' `shouldBe` 0
109+          l' <- getCurrentLine c
110+          l' `shouldBe` Nothing
111 
112-        l' <- getCurrentLine c
113-        l' `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
114+        it "fullScan works" $ \(_, c) -> do
115+          l <- getCurrentLine c
116+          l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
117 
118-        ln' <- goToLine c 5
119-        ln' `shouldBe` 5
120+          doFullScan c
121 
122-        l'' <- getCurrentLine c
123-        l'' `shouldBe` Just "Curabitur nec dui posuere, tincidunt turpis vitae, tincidunt magna."
124+          cln <- getCurrentLineNumber c
125+          cln `shouldBe` 20
126 
127-        ln'' <- goToLine c 6
128-        ln'' `shouldBe` 6
129+          l' <- getCurrentLine c
130+          l' `shouldBe` Nothing
131 
132-        ln''' <- goToLine c 7
133-        ln''' `shouldBe` 7
134+        it "read line, then go to beginning and forth" $ \(_, c) -> do
135+          cln <- getCurrentLineNumber c
136+          cln `shouldBe` 0
137 
138-        ln'''' <- goToLine c 10
139-        ln'''' `shouldBe` 10
140+          l <- getCurrentLine c
141+          l `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
142+
143+          cln' <- getCurrentLineNumber c
144+          cln' `shouldBe` 1
145+
146+          _ <- getCurrentLine c
147+          _ <- getCurrentLine c
148+          _ <- getCurrentLine c
149+          _ <- getCurrentLine c
150+          _ <- getCurrentLine c
151+
152+          cln'' <- getCurrentLineNumber c
153+          cln'' `shouldBe` 6
154+
155+          ln <- goToLine c 0
156+          ln `shouldBe` 0
157+
158+          cln''' <- getCurrentLineNumber c
159+          cln''' `shouldBe` 0
160+
161+          l' <- getCurrentLine c
162+          l' `shouldBe` Just "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
163+
164+          ln' <- goToLine c 5
165+          ln' `shouldBe` 5
166+
167+          l'' <- getCurrentLine c
168+          l'' `shouldBe` Just "Curabitur nec dui posuere, tincidunt turpis vitae, tincidunt magna."
169+
170+          ln'' <- goToLine c 6
171+          ln'' `shouldBe` 6
172+
173+          ln''' <- goToLine c 7
174+          ln''' `shouldBe` 7
175+
176+          ln'''' <- goToLine c 10
177+          ln'''' `shouldBe` 10