repos / line-indexed-cursor.hs.git


line-indexed-cursor.hs.git / src / System / IO
Evgenii Akentev  ·  2023-07-23

LineIndexedCursor.hs

  1{-# LANGUAGE DuplicateRecordFields #-}
  2{-# LANGUAGE RecordWildCards #-}
  3
  4-----------------------------------------------------------------------------
  5-- |
  6-- Module      :  System.IO.LineIndexedCursor
  7-- Maintainer  :  i@ak3n.com
  8--
  9-- Line-indexed file reader.
 10--
 11-- Lazily builds the index with the line numbers while reading the file
 12-- making it possible to rewind to them quickly later.
 13--
 14-- The indices are stored using a list, with a fixed length capacity, and 'Data.Array'.
 15-- Each new element is added to the list and when the capacity is reached, the list items get flushed
 16-- to the array for faster access. The list's capacity can be configured using 'mkLineIndexedCursorWithCapacity'.
 17-----------------------------------------------------------------------------
 18
 19module System.IO.LineIndexedCursor
 20  ( LineIndexedCursor(..)
 21  , mkLineIndexedCursor
 22  , mkLineIndexedCursorWithCapacity
 23  ) where
 24
 25import Data.Maybe (fromMaybe)
 26import qualified Data.Array as A
 27import Data.ByteString (ByteString, hGetLine)
 28import Control.Concurrent.MVar (MVar, newMVar, readMVar, modifyMVar, modifyMVar_)
 29import System.IO (Handle, hTell, hSeek, SeekMode(..), hIsEOF)
 30
 31defaultListCapacity :: Integer
 32defaultListCapacity = 16384
 33
 34-- | ADT with methods, hiding the internal state.
 35--
 36-- 'LineIndexedCursor.getCurrentLine', 'LineIndexedCursor.getCurrentLineUnsafe',
 37-- 'LineIndexedCursor.doFullScan', and 'LineIndexedCursor.goToLine', all throw 'System.IO.IOError'.
 38data LineIndexedCursor = LineIndexedCursor
 39  {
 40  -- | Same as 'LineIndexedCursor.getCurrentLineUnsafe' but safely handles 'System.IO.EOF'.
 41  getCurrentLine :: IO (Maybe ByteString)
 42
 43  -- | A wrapper around 'hGetLine'. Throws the same exceptions.
 44  , getCurrentLineUnsafe :: IO ByteString
 45
 46  -- | Returns the current line number.
 47  , getCurrentLineNumber :: IO Integer
 48
 49  -- | Reads from the latest known line until EOF to build the full index.
 50  , doFullScan :: IO ()
 51
 52  -- | Rewinds the file handle to the requsted line number. Stops at the EOF if it's too big,
 53  -- returning the reached line number.
 54  , goToLine :: Integer -> IO Integer
 55
 56  -- | Returns the file 'Handle'.
 57  , getHandle :: Handle
 58
 59  -- | Returns the current state of the cursor — all known line indexes.
 60  , getCursorState :: IO [Integer]
 61  }
 62
 63data CursorHandle = CursorHandle
 64  { fileHandle :: Handle
 65  , cursorState :: MVar CursorState
 66  , listCapacity :: Integer
 67  }
 68
 69data CursorState = CursorState
 70  { cursorLinesIdx :: ![Integer]
 71  , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) -- uses Maybe since can't be empty
 72  , cursorIdxSize :: !Integer
 73  , cursorCurrentLineNumber :: !Integer
 74  }
 75
 76{- |
 77
 78Builds 'LineIndexedCursor'. Resets the file handle's ofsset to the beginning.
 79
 80Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'.
 81
 82-}
 83mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
 84mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
 85
 86-- | Same as 'mkLineIndexedCursor' but allows to configure the list's capacity.
 87mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
 88mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
 89  -- reset the handle's offset to the beginning
 90  hSeek fileHandle AbsoluteSeek 0
 91
 92  cursorState <- newMVar $ CursorState [0] Nothing 0 0
 93
 94  let cursorHandle = CursorHandle fileHandle cursorState listCapacity
 95  pure $ LineIndexedCursor
 96    { getCurrentLine = getCurrentLine' cursorHandle
 97    , getCurrentLineUnsafe = getCurrentLineUnsafe' cursorHandle
 98    , getCurrentLineNumber = getCurrentLineNumber' cursorHandle
 99    , doFullScan = doFullScan' cursorHandle
100    , goToLine = goToLine' cursorHandle
101    , getHandle = fileHandle
102    , getCursorState = getCursorState' cursorHandle
103    }
104
105getCurrentLine' :: CursorHandle -> IO (Maybe ByteString)
106getCurrentLine' CursorHandle{..} =
107  hIsEOF fileHandle >>= \isEOF -> if isEOF then pure Nothing else do
108    line <- hGetLine fileHandle
109    offset <- hTell fileHandle
110
111    modifyMVar_ cursorState $ \cs@(CursorState idx arr size cln) ->
112      let latestIdx = getLatestIdx cs
113      in pure $
114        if (offset <= latestIdx)
115        -- we already know this offset, so just increment the current line number
116        then cs { cursorCurrentLineNumber = cln + 1 }
117        -- otherwise we need to add the offset
118        else
119          let
120            (newIdx, newArr) =
121              -- if we have exceed the list capacity
122              if length (offset : idx) > fromIntegral listCapacity
123              -- move the list content to the array and empty the list
124              then
125                let res = (offset : idx) ++ maybe [] A.elems arr
126                in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
127              -- otherwise keep the offset in the list
128              else (offset : idx, arr)
129          in CursorState
130          { cursorLinesIdx = newIdx
131          , cursorLinesArrIdx = newArr
132          , cursorIdxSize = size + 1
133          , cursorCurrentLineNumber = cln + 1
134          }
135    pure $ Just line
136
137getCurrentLineUnsafe' :: CursorHandle -> IO ByteString
138getCurrentLineUnsafe' ch = do
139  cl <- getCurrentLine' ch
140  pure $ fromMaybe (error "getCurrentLineUnsafe: couldn't get the current line") cl
141
142doFullScan' :: CursorHandle -> IO ()
143doFullScan' CursorHandle{..} = do
144  modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
145    -- go to the end of the index
146    hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
147    -- try to read until the EOF
148    idxTail <- readUntilEOF []
149    let
150      newSize = size + (fromIntegral $ length idxTail)
151      newState = CursorState
152          { cursorLinesIdx = idxTail ++ idx
153          , cursorLinesArrIdx = arr
154          , cursorIdxSize = newSize
155          , cursorCurrentLineNumber = newSize
156          }
157    pure newState
158  where
159    readUntilEOF idx =
160      hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
161        _ <- hGetLine fileHandle
162        offset <- hTell fileHandle
163        readUntilEOF (fromInteger offset : idx)
164
165getCurrentLineNumber' :: CursorHandle -> IO Integer
166getCurrentLineNumber' CursorHandle{..} = do
167  CursorState _ _ _ cln <- readMVar cursorState
168  pure cln
169
170goToLine' :: CursorHandle -> Integer -> IO Integer
171goToLine' ch@CursorHandle{..} ln =
172  -- handle negative input
173  if (ln < 0) then getCurrentLineNumber' ch
174  else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
175    -- if the requested line number is out of the index's scope
176    if ln > size then do
177      -- go to the end of the index
178      hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
179      -- try to read until the requested line number
180      idxTail <- readUntil (ln - size) []
181      let
182        newSize = size + (fromIntegral $ length idxTail)
183        (newIdx, newArr) =
184            -- if we have exceed the list capacity
185            if newSize > listCapacity
186            -- move the list content to the array and empty the list
187            then
188              let res = (idxTail ++ idx) ++ maybe [] A.elems arr
189              in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
190            -- otherwise add offsets to the list
191            else (idxTail ++ idx, arr)
192        newState = CursorState
193          { cursorLinesIdx = newIdx
194          , cursorLinesArrIdx = newArr
195          , cursorIdxSize = newSize
196          , cursorCurrentLineNumber = newSize
197          }
198      pure (newState, newSize)
199    -- otherwise access the offset in the cache (list + array)
200    else do
201      let nextSeekIndex = size - ln
202      -- if the seek index is bigger than the current list size
203      if nextSeekIndex >= fromIntegral (length idx)
204      -- try to access the array
205      then case arr of
206        Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
207        Nothing -> error "goToLine: there is no array"
208      -- otherwise take the offset from the list
209      else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
210
211      pure (cs { cursorCurrentLineNumber = ln } , ln)
212  where
213    readUntil 0 idx = pure idx
214    readUntil counter idx =
215      hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
216        _ <- hGetLine fileHandle
217        offset <- hTell fileHandle
218        readUntil (counter - 1) (fromInteger offset : idx)
219
220getCursorState' :: CursorHandle -> IO [Integer]
221getCursorState' CursorHandle{..} = do
222  CursorState l arr _ _ <- readMVar cursorState
223  pure $ reverse $ l ++ maybe [] A.elems arr
224
225-- Utils
226
227getLatestIdx :: CursorState -> Integer
228getLatestIdx (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
229getLatestIdx (CursorState idx Nothing _ _) = idx !! 0