Evgenii Akentev
·
2023-07-23
LineIndexedCursor.hs
1{-# LANGUAGE DuplicateRecordFields #-}
2{-# LANGUAGE RecordWildCards #-}
3
4-----------------------------------------------------------------------------
5-- |
6-- Module : System.IO.LineIndexedCursor
7-- Maintainer : i@ak3n.com
8--
9-- Line-indexed file reader.
10--
11-- Lazily builds the index with the line numbers while reading the file
12-- making it possible to rewind to them quickly later.
13--
14-- The indices are stored using a list, with a fixed length capacity, and 'Data.Array'.
15-- Each new element is added to the list and when the capacity is reached, the list items get flushed
16-- to the array for faster access. The list's capacity can be configured using 'mkLineIndexedCursorWithCapacity'.
17-----------------------------------------------------------------------------
18
19module System.IO.LineIndexedCursor
20 ( LineIndexedCursor(..)
21 , mkLineIndexedCursor
22 , mkLineIndexedCursorWithCapacity
23 ) where
24
25import Data.Maybe (fromMaybe)
26import qualified Data.Array as A
27import Data.ByteString (ByteString, hGetLine)
28import Control.Concurrent.MVar (MVar, newMVar, readMVar, modifyMVar, modifyMVar_)
29import System.IO (Handle, hTell, hSeek, SeekMode(..), hIsEOF)
30
31defaultListCapacity :: Integer
32defaultListCapacity = 16384
33
34-- | ADT with methods, hiding the internal state.
35--
36-- 'LineIndexedCursor.getCurrentLine', 'LineIndexedCursor.getCurrentLineUnsafe',
37-- 'LineIndexedCursor.doFullScan', and 'LineIndexedCursor.goToLine', all throw 'System.IO.IOError'.
38data LineIndexedCursor = LineIndexedCursor
39 {
40 -- | Same as 'LineIndexedCursor.getCurrentLineUnsafe' but safely handles 'System.IO.EOF'.
41 getCurrentLine :: IO (Maybe ByteString)
42
43 -- | A wrapper around 'hGetLine'. Throws the same exceptions.
44 , getCurrentLineUnsafe :: IO ByteString
45
46 -- | Returns the current line number.
47 , getCurrentLineNumber :: IO Integer
48
49 -- | Reads from the latest known line until EOF to build the full index.
50 , doFullScan :: IO ()
51
52 -- | Rewinds the file handle to the requsted line number. Stops at the EOF if it's too big,
53 -- returning the reached line number.
54 , goToLine :: Integer -> IO Integer
55
56 -- | Returns the file 'Handle'.
57 , getHandle :: Handle
58
59 -- | Returns the current state of the cursor — all known line indexes.
60 , getCursorState :: IO [Integer]
61 }
62
63data CursorHandle = CursorHandle
64 { fileHandle :: Handle
65 , cursorState :: MVar CursorState
66 , listCapacity :: Integer
67 }
68
69data CursorState = CursorState
70 { cursorLinesIdx :: ![Integer]
71 , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) -- uses Maybe since can't be empty
72 , cursorIdxSize :: !Integer
73 , cursorCurrentLineNumber :: !Integer
74 }
75
76{- |
77
78Builds 'LineIndexedCursor'. Resets the file handle's ofsset to the beginning.
79
80Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'.
81
82-}
83mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
84mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
85
86-- | Same as 'mkLineIndexedCursor' but allows to configure the list's capacity.
87mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
88mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
89 -- reset the handle's offset to the beginning
90 hSeek fileHandle AbsoluteSeek 0
91
92 cursorState <- newMVar $ CursorState [0] Nothing 0 0
93
94 let cursorHandle = CursorHandle fileHandle cursorState listCapacity
95 pure $ LineIndexedCursor
96 { getCurrentLine = getCurrentLine' cursorHandle
97 , getCurrentLineUnsafe = getCurrentLineUnsafe' cursorHandle
98 , getCurrentLineNumber = getCurrentLineNumber' cursorHandle
99 , doFullScan = doFullScan' cursorHandle
100 , goToLine = goToLine' cursorHandle
101 , getHandle = fileHandle
102 , getCursorState = getCursorState' cursorHandle
103 }
104
105getCurrentLine' :: CursorHandle -> IO (Maybe ByteString)
106getCurrentLine' CursorHandle{..} =
107 hIsEOF fileHandle >>= \isEOF -> if isEOF then pure Nothing else do
108 line <- hGetLine fileHandle
109 offset <- hTell fileHandle
110
111 modifyMVar_ cursorState $ \cs@(CursorState idx arr size cln) ->
112 let latestIdx = getLatestIdx cs
113 in pure $
114 if (offset <= latestIdx)
115 -- we already know this offset, so just increment the current line number
116 then cs { cursorCurrentLineNumber = cln + 1 }
117 -- otherwise we need to add the offset
118 else
119 let
120 (newIdx, newArr) =
121 -- if we have exceed the list capacity
122 if length (offset : idx) > fromIntegral listCapacity
123 -- move the list content to the array and empty the list
124 then
125 let res = (offset : idx) ++ maybe [] A.elems arr
126 in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
127 -- otherwise keep the offset in the list
128 else (offset : idx, arr)
129 in CursorState
130 { cursorLinesIdx = newIdx
131 , cursorLinesArrIdx = newArr
132 , cursorIdxSize = size + 1
133 , cursorCurrentLineNumber = cln + 1
134 }
135 pure $ Just line
136
137getCurrentLineUnsafe' :: CursorHandle -> IO ByteString
138getCurrentLineUnsafe' ch = do
139 cl <- getCurrentLine' ch
140 pure $ fromMaybe (error "getCurrentLineUnsafe: couldn't get the current line") cl
141
142doFullScan' :: CursorHandle -> IO ()
143doFullScan' CursorHandle{..} = do
144 modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
145 -- go to the end of the index
146 hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
147 -- try to read until the EOF
148 idxTail <- readUntilEOF []
149 let
150 newSize = size + (fromIntegral $ length idxTail)
151 newState = CursorState
152 { cursorLinesIdx = idxTail ++ idx
153 , cursorLinesArrIdx = arr
154 , cursorIdxSize = newSize
155 , cursorCurrentLineNumber = newSize
156 }
157 pure newState
158 where
159 readUntilEOF idx =
160 hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
161 _ <- hGetLine fileHandle
162 offset <- hTell fileHandle
163 readUntilEOF (fromInteger offset : idx)
164
165getCurrentLineNumber' :: CursorHandle -> IO Integer
166getCurrentLineNumber' CursorHandle{..} = do
167 CursorState _ _ _ cln <- readMVar cursorState
168 pure cln
169
170goToLine' :: CursorHandle -> Integer -> IO Integer
171goToLine' ch@CursorHandle{..} ln =
172 -- handle negative input
173 if (ln < 0) then getCurrentLineNumber' ch
174 else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
175 -- if the requested line number is out of the index's scope
176 if ln > size then do
177 -- go to the end of the index
178 hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
179 -- try to read until the requested line number
180 idxTail <- readUntil (ln - size) []
181 let
182 newSize = size + (fromIntegral $ length idxTail)
183 (newIdx, newArr) =
184 -- if we have exceed the list capacity
185 if newSize > listCapacity
186 -- move the list content to the array and empty the list
187 then
188 let res = (idxTail ++ idx) ++ maybe [] A.elems arr
189 in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
190 -- otherwise add offsets to the list
191 else (idxTail ++ idx, arr)
192 newState = CursorState
193 { cursorLinesIdx = newIdx
194 , cursorLinesArrIdx = newArr
195 , cursorIdxSize = newSize
196 , cursorCurrentLineNumber = newSize
197 }
198 pure (newState, newSize)
199 -- otherwise access the offset in the cache (list + array)
200 else do
201 let nextSeekIndex = size - ln
202 -- if the seek index is bigger than the current list size
203 if nextSeekIndex >= fromIntegral (length idx)
204 -- try to access the array
205 then case arr of
206 Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
207 Nothing -> error "goToLine: there is no array"
208 -- otherwise take the offset from the list
209 else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
210
211 pure (cs { cursorCurrentLineNumber = ln } , ln)
212 where
213 readUntil 0 idx = pure idx
214 readUntil counter idx =
215 hIsEOF fileHandle >>= \isEOF -> if isEOF then pure idx else do
216 _ <- hGetLine fileHandle
217 offset <- hTell fileHandle
218 readUntil (counter - 1) (fromInteger offset : idx)
219
220getCursorState' :: CursorHandle -> IO [Integer]
221getCursorState' CursorHandle{..} = do
222 CursorState l arr _ _ <- readMVar cursorState
223 pure $ reverse $ l ++ maybe [] A.elems arr
224
225-- Utils
226
227getLatestIdx :: CursorState -> Integer
228getLatestIdx (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
229getLatestIdx (CursorState idx Nothing _ _) = idx !! 0