repos / line-indexed-cursor.hs.git


commit
0e109fc
parent
315fdf5
author
Evgenii Akentev
date
2023-07-23 01:11:56 +0400 +04
Add: comments, 'getCurrentLineUnsafe', 'getCursorState'.
Refactor a bit.
2 files changed,  +81, -52
M src/System/IO/LineIndexedCursor.hs
+78, -52
  1@@ -8,14 +8,17 @@
  2 --
  3 -- Line-indexed file reader.
  4 --
  5--- Lazily builds the index of line numbers while reading the file
  6+-- Lazily builds the index with the line numbers while reading the file
  7 -- making it possible to rewind to them quickly later.
  8 -----------------------------------------------------------------------------
  9 
 10-module System.IO.LineIndexedCursor (
 11-  LineIndexedCursor(..), mkLineIndexedCursor, mkLineIndexedCursorWithCapacity
 12+module System.IO.LineIndexedCursor
 13+  ( LineIndexedCursor(..)
 14+  , mkLineIndexedCursor
 15+  , mkLineIndexedCursorWithCapacity
 16   ) where
 17 
 18+import Data.Maybe (fromMaybe)
 19 import qualified Data.Array as A
 20 import Data.ByteString (ByteString, hGetLine)
 21 import Control.Concurrent.MVar
 22@@ -25,23 +28,32 @@ defaultListCapacity :: Integer
 23 defaultListCapacity = 16384
 24 
 25 -- | ADT with methods, hiding the internal state.
 26+--
 27+-- 'LineIndexedCursor.getCurrentLine', 'LineIndexedCursor.getCurrentLineUnsafe',
 28+-- 'LineIndexedCursor.doFullScan', and 'LineIndexedCursor.goToLine', all throw 'System.IO.IOError'.
 29 data LineIndexedCursor = LineIndexedCursor
 30   {
 31-  -- | Same as 'hGetLine' but safe.
 32+  -- | Same as 'LineIndexedCursor.getCurrentLineUnsafe' but safely handles 'System.IO.EOF'.
 33   getCurrentLine :: IO (Maybe ByteString)
 34 
 35-  -- | Returns current line number.
 36+  -- | A wrapper around 'hGetLine'. Throws the same exceptions.
 37+  , getCurrentLineUnsafe :: IO ByteString
 38+
 39+  -- | Returns the current line number.
 40   , getCurrentLineNumber :: IO Integer
 41 
 42-  -- | Reads from the latest line from index until EOF to build the full index.
 43+  -- | Reads from the latest known line until EOF to build the full index.
 44   , doFullScan :: IO ()
 45 
 46-  -- | Rewinds to the requsted line number. Stops at EOF if it's too big.
 47-  -- Returns the reached line number.
 48+  -- | Rewinds the file handle to the requsted line number. Stops at the EOF if it's too big,
 49+  -- returning the reached line number.
 50   , goToLine :: Integer -> IO Integer
 51 
 52   -- | Returns the file 'Handle'.
 53   , getHandle :: Handle
 54+
 55+  -- | Returns the current state of the cursor — all known line indexes.
 56+  , getCursorState :: IO [Integer]
 57   }
 58 
 59 data CursorHandle = CursorHandle
 60@@ -52,19 +64,14 @@ data CursorHandle = CursorHandle
 61 
 62 data CursorState = CursorState
 63   { cursorLinesIdx :: ![Integer]
 64-  , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer))
 65+  , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) -- uses Maybe since can't be empty
 66   , cursorIdxSize :: !Integer
 67   , cursorCurrentLineNumber :: !Integer
 68   }
 69 
 70-mElems :: (Maybe (A.Array Integer Integer)) -> [Integer]
 71-mElems = maybe [] A.elems
 72-
 73 {- |
 74 
 75-Builds 'LineIndexedCursor'.
 76-
 77-Resets the file handle's ofsset to the beginning.
 78+Builds 'LineIndexedCursor'. Resets the file handle's ofsset to the beginning.
 79 
 80 Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'.
 81 
 82@@ -72,6 +79,7 @@ Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'
 83 mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
 84 mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
 85 
 86+-- | Same as 'mkLineIndexedCursor' but allows to configure the list's capacity.
 87 mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
 88 mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
 89   -- reset the handle's offset to the beginning
 90@@ -82,10 +90,12 @@ mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
 91   let cursorHandle = CursorHandle fileHandle cursorState listCapacity
 92   pure $ LineIndexedCursor
 93     { getCurrentLine = getCurrentLine' cursorHandle
 94+    , getCurrentLineUnsafe = getCurrentLineUnsafe' cursorHandle
 95     , getCurrentLineNumber = getCurrentLineNumber' cursorHandle
 96     , doFullScan = doFullScan' cursorHandle
 97     , goToLine = goToLine' cursorHandle
 98     , getHandle = fileHandle
 99+    , getCursorState = getCursorState' cursorHandle
100     }
101 
102 getCurrentLine' :: CursorHandle -> IO (Maybe ByteString)
103@@ -94,35 +104,42 @@ getCurrentLine' CursorHandle{..} =
104     line <- hGetLine fileHandle
105     offset <- hTell fileHandle
106 
107-    modifyMVar_ cursorState $ \(CursorState idx arr size cln) -> pure $
108-      if (not $ offset `elem` idx)
109-      then
110-        let
111-          (newIdx, newArr) =
112-            if length (offset : idx) > fromIntegral listCapacity
113-            then
114-              let res = (offset : idx) ++ mElems arr
115-              in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
116-            else (offset : idx, arr)
117-        in CursorState
118-        { cursorLinesIdx = newIdx
119-        , cursorLinesArrIdx = newArr
120-        , cursorIdxSize = size + 1
121-        , cursorCurrentLineNumber = cln + 1
122-        }
123-      else CursorState
124-        { cursorLinesIdx = idx
125-        , cursorLinesArrIdx = arr
126-        , cursorIdxSize = size
127-        , cursorCurrentLineNumber = cln + 1
128-        }
129+    modifyMVar_ cursorState $ \cs@(CursorState idx arr size cln) ->
130+      let latestIdx = getLatestIdx cs
131+      in pure $
132+        if (offset <= latestIdx)
133+        -- we already know this offset, so just increment the current line number
134+        then cs { cursorCurrentLineNumber = cln + 1 }
135+        -- otherwise we need to add the offset
136+        else
137+          let
138+            (newIdx, newArr) =
139+              -- if we have exceed the list capacity
140+              if length (offset : idx) > fromIntegral listCapacity
141+              -- move the list content to the array and empty the list
142+              then
143+                let res = (offset : idx) ++ maybe [] A.elems arr
144+                in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
145+              -- otherwise keep the offset in the list
146+              else (offset : idx, arr)
147+          in CursorState
148+          { cursorLinesIdx = newIdx
149+          , cursorLinesArrIdx = newArr
150+          , cursorIdxSize = size + 1
151+          , cursorCurrentLineNumber = cln + 1
152+          }
153     pure $ Just line
154 
155+getCurrentLineUnsafe' :: CursorHandle -> IO ByteString
156+getCurrentLineUnsafe' ch = do
157+  cl <- getCurrentLine' ch
158+  pure $ fromMaybe (error "getCurrentLineUnsafe: couldn't get the current line") cl
159+
160 doFullScan' :: CursorHandle -> IO ()
161 doFullScan' CursorHandle{..} = do
162   modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
163     -- go to the end of the index
164-    hSeek fileHandle AbsoluteSeek (getFirst cs)
165+    hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
166     -- try to read until the EOF
167     idxTail <- readUntilEOF []
168     let
169@@ -148,19 +165,25 @@ getCurrentLineNumber' CursorHandle{..} = do
170 
171 goToLine' :: CursorHandle -> Integer -> IO Integer
172 goToLine' ch@CursorHandle{..} ln =
173+  -- handle negative input
174   if (ln < 0) then getCurrentLineNumber' ch
175   else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
176+    -- if the requested line number is out of the index's scope
177     if ln > size then do
178-      hSeek fileHandle AbsoluteSeek (getFirst cs)
179+      -- go to the end of the index
180+      hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
181       -- try to read until the requested line number
182       idxTail <- readUntil (ln - size) []
183       let
184         newSize = size + (fromIntegral $ length idxTail)
185         (newIdx, newArr) =
186+            -- if we have exceed the list capacity
187             if newSize > listCapacity
188+            -- move the list content to the array and empty the list
189             then
190-              let res = (idxTail ++ idx) ++ mElems arr
191+              let res = (idxTail ++ idx) ++ maybe [] A.elems arr
192               in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
193+            -- otherwise add offsets to the list
194             else (idxTail ++ idx, arr)
195         newState = CursorState
196           { cursorLinesIdx = newIdx
197@@ -169,23 +192,19 @@ goToLine' ch@CursorHandle{..} ln =
198           , cursorCurrentLineNumber = newSize
199           }
200       pure (newState, newSize)
201+    -- otherwise access the offset in the cache (list + array)
202     else do
203       let nextSeekIndex = size - ln
204-
205+      -- if the seek index is bigger than the current list size
206       if nextSeekIndex >= fromIntegral (length idx)
207+      -- try to access the array
208       then case arr of
209         Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
210         Nothing -> error "goToLine: there is no array"
211+      -- otherwise take the offset from the list
212       else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
213 
214-      let
215-        newState = CursorState
216-          { cursorLinesIdx = idx
217-          , cursorLinesArrIdx = arr
218-          , cursorIdxSize = size
219-          , cursorCurrentLineNumber = ln
220-          }
221-      pure (newState, ln)
222+      pure (cs { cursorCurrentLineNumber = ln } , ln)
223   where
224     readUntil 0 idx = pure idx
225     readUntil counter idx =
226@@ -194,6 +213,13 @@ goToLine' ch@CursorHandle{..} ln =
227         offset <- hTell fileHandle
228         readUntil (counter - 1) (fromInteger offset : idx)
229 
230-getFirst :: CursorState -> Integer
231-getFirst (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
232-getFirst (CursorState idx Nothing _ _) = idx !! 0
233+getCursorState' :: CursorHandle -> IO [Integer]
234+getCursorState' CursorHandle{..} = do
235+  CursorState l arr _ _ <- readMVar cursorState
236+  pure $ reverse $ l ++ maybe [] A.elems arr
237+
238+-- Utils
239+
240+getLatestIdx :: CursorState -> Integer
241+getLatestIdx (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
242+getLatestIdx (CursorState idx Nothing _ _) = idx !! 0
M test/Main.hs
+3, -0
 1@@ -81,6 +81,9 @@ main = hspec $ do
 2           l' <- getCurrentLine c
 3           l' `shouldBe` Nothing
 4 
 5+          s <- getCursorState c
 6+          s `shouldBe` [0,57,117,191,244,316,384,429,511,561,616,668,715,761,799,851,907,941,981,1024,1068]
 7+
 8         it "read line, then go to beginning and forth" $ \(_, c) -> do
 9           cln <- getCurrentLineNumber c
10           cln `shouldBe` 0