- commit
- 0e109fc
- parent
- 315fdf5
- author
- Evgenii Akentev
- date
- 2023-07-23 01:11:56 +0400 +04
Add: comments, 'getCurrentLineUnsafe', 'getCursorState'. Refactor a bit.
2 files changed,
+81,
-52
+78,
-52
1@@ -8,14 +8,17 @@
2 --
3 -- Line-indexed file reader.
4 --
5--- Lazily builds the index of line numbers while reading the file
6+-- Lazily builds the index with the line numbers while reading the file
7 -- making it possible to rewind to them quickly later.
8 -----------------------------------------------------------------------------
9
10-module System.IO.LineIndexedCursor (
11- LineIndexedCursor(..), mkLineIndexedCursor, mkLineIndexedCursorWithCapacity
12+module System.IO.LineIndexedCursor
13+ ( LineIndexedCursor(..)
14+ , mkLineIndexedCursor
15+ , mkLineIndexedCursorWithCapacity
16 ) where
17
18+import Data.Maybe (fromMaybe)
19 import qualified Data.Array as A
20 import Data.ByteString (ByteString, hGetLine)
21 import Control.Concurrent.MVar
22@@ -25,23 +28,32 @@ defaultListCapacity :: Integer
23 defaultListCapacity = 16384
24
25 -- | ADT with methods, hiding the internal state.
26+--
27+-- 'LineIndexedCursor.getCurrentLine', 'LineIndexedCursor.getCurrentLineUnsafe',
28+-- 'LineIndexedCursor.doFullScan', and 'LineIndexedCursor.goToLine', all throw 'System.IO.IOError'.
29 data LineIndexedCursor = LineIndexedCursor
30 {
31- -- | Same as 'hGetLine' but safe.
32+ -- | Same as 'LineIndexedCursor.getCurrentLineUnsafe' but safely handles 'System.IO.EOF'.
33 getCurrentLine :: IO (Maybe ByteString)
34
35- -- | Returns current line number.
36+ -- | A wrapper around 'hGetLine'. Throws the same exceptions.
37+ , getCurrentLineUnsafe :: IO ByteString
38+
39+ -- | Returns the current line number.
40 , getCurrentLineNumber :: IO Integer
41
42- -- | Reads from the latest line from index until EOF to build the full index.
43+ -- | Reads from the latest known line until EOF to build the full index.
44 , doFullScan :: IO ()
45
46- -- | Rewinds to the requsted line number. Stops at EOF if it's too big.
47- -- Returns the reached line number.
48+ -- | Rewinds the file handle to the requsted line number. Stops at the EOF if it's too big,
49+ -- returning the reached line number.
50 , goToLine :: Integer -> IO Integer
51
52 -- | Returns the file 'Handle'.
53 , getHandle :: Handle
54+
55+ -- | Returns the current state of the cursor — all known line indexes.
56+ , getCursorState :: IO [Integer]
57 }
58
59 data CursorHandle = CursorHandle
60@@ -52,19 +64,14 @@ data CursorHandle = CursorHandle
61
62 data CursorState = CursorState
63 { cursorLinesIdx :: ![Integer]
64- , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer))
65+ , cursorLinesArrIdx :: !(Maybe (A.Array Integer Integer)) -- uses Maybe since can't be empty
66 , cursorIdxSize :: !Integer
67 , cursorCurrentLineNumber :: !Integer
68 }
69
70-mElems :: (Maybe (A.Array Integer Integer)) -> [Integer]
71-mElems = maybe [] A.elems
72-
73 {- |
74
75-Builds 'LineIndexedCursor'.
76-
77-Resets the file handle's ofsset to the beginning.
78+Builds 'LineIndexedCursor'. Resets the file handle's ofsset to the beginning.
79
80 Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'.
81
82@@ -72,6 +79,7 @@ Use 'System.IO.hSetNewlineMode' if you want to configure 'System.IO.NewlineMode'
83 mkLineIndexedCursor :: Handle -> IO LineIndexedCursor
84 mkLineIndexedCursor = flip mkLineIndexedCursorWithCapacity defaultListCapacity
85
86+-- | Same as 'mkLineIndexedCursor' but allows to configure the list's capacity.
87 mkLineIndexedCursorWithCapacity :: Handle -> Integer -> IO LineIndexedCursor
88 mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
89 -- reset the handle's offset to the beginning
90@@ -82,10 +90,12 @@ mkLineIndexedCursorWithCapacity fileHandle listCapacity = do
91 let cursorHandle = CursorHandle fileHandle cursorState listCapacity
92 pure $ LineIndexedCursor
93 { getCurrentLine = getCurrentLine' cursorHandle
94+ , getCurrentLineUnsafe = getCurrentLineUnsafe' cursorHandle
95 , getCurrentLineNumber = getCurrentLineNumber' cursorHandle
96 , doFullScan = doFullScan' cursorHandle
97 , goToLine = goToLine' cursorHandle
98 , getHandle = fileHandle
99+ , getCursorState = getCursorState' cursorHandle
100 }
101
102 getCurrentLine' :: CursorHandle -> IO (Maybe ByteString)
103@@ -94,35 +104,42 @@ getCurrentLine' CursorHandle{..} =
104 line <- hGetLine fileHandle
105 offset <- hTell fileHandle
106
107- modifyMVar_ cursorState $ \(CursorState idx arr size cln) -> pure $
108- if (not $ offset `elem` idx)
109- then
110- let
111- (newIdx, newArr) =
112- if length (offset : idx) > fromIntegral listCapacity
113- then
114- let res = (offset : idx) ++ mElems arr
115- in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
116- else (offset : idx, arr)
117- in CursorState
118- { cursorLinesIdx = newIdx
119- , cursorLinesArrIdx = newArr
120- , cursorIdxSize = size + 1
121- , cursorCurrentLineNumber = cln + 1
122- }
123- else CursorState
124- { cursorLinesIdx = idx
125- , cursorLinesArrIdx = arr
126- , cursorIdxSize = size
127- , cursorCurrentLineNumber = cln + 1
128- }
129+ modifyMVar_ cursorState $ \cs@(CursorState idx arr size cln) ->
130+ let latestIdx = getLatestIdx cs
131+ in pure $
132+ if (offset <= latestIdx)
133+ -- we already know this offset, so just increment the current line number
134+ then cs { cursorCurrentLineNumber = cln + 1 }
135+ -- otherwise we need to add the offset
136+ else
137+ let
138+ (newIdx, newArr) =
139+ -- if we have exceed the list capacity
140+ if length (offset : idx) > fromIntegral listCapacity
141+ -- move the list content to the array and empty the list
142+ then
143+ let res = (offset : idx) ++ maybe [] A.elems arr
144+ in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
145+ -- otherwise keep the offset in the list
146+ else (offset : idx, arr)
147+ in CursorState
148+ { cursorLinesIdx = newIdx
149+ , cursorLinesArrIdx = newArr
150+ , cursorIdxSize = size + 1
151+ , cursorCurrentLineNumber = cln + 1
152+ }
153 pure $ Just line
154
155+getCurrentLineUnsafe' :: CursorHandle -> IO ByteString
156+getCurrentLineUnsafe' ch = do
157+ cl <- getCurrentLine' ch
158+ pure $ fromMaybe (error "getCurrentLineUnsafe: couldn't get the current line") cl
159+
160 doFullScan' :: CursorHandle -> IO ()
161 doFullScan' CursorHandle{..} = do
162 modifyMVar_ cursorState $ \cs@(CursorState idx arr size _) -> do
163 -- go to the end of the index
164- hSeek fileHandle AbsoluteSeek (getFirst cs)
165+ hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
166 -- try to read until the EOF
167 idxTail <- readUntilEOF []
168 let
169@@ -148,19 +165,25 @@ getCurrentLineNumber' CursorHandle{..} = do
170
171 goToLine' :: CursorHandle -> Integer -> IO Integer
172 goToLine' ch@CursorHandle{..} ln =
173+ -- handle negative input
174 if (ln < 0) then getCurrentLineNumber' ch
175 else modifyMVar cursorState $ \cs@(CursorState idx arr size _) -> do
176+ -- if the requested line number is out of the index's scope
177 if ln > size then do
178- hSeek fileHandle AbsoluteSeek (getFirst cs)
179+ -- go to the end of the index
180+ hSeek fileHandle AbsoluteSeek (getLatestIdx cs)
181 -- try to read until the requested line number
182 idxTail <- readUntil (ln - size) []
183 let
184 newSize = size + (fromIntegral $ length idxTail)
185 (newIdx, newArr) =
186+ -- if we have exceed the list capacity
187 if newSize > listCapacity
188+ -- move the list content to the array and empty the list
189 then
190- let res = (idxTail ++ idx) ++ mElems arr
191+ let res = (idxTail ++ idx) ++ maybe [] A.elems arr
192 in ([], Just $ A.listArray (0, toInteger $ length res - 1) res)
193+ -- otherwise add offsets to the list
194 else (idxTail ++ idx, arr)
195 newState = CursorState
196 { cursorLinesIdx = newIdx
197@@ -169,23 +192,19 @@ goToLine' ch@CursorHandle{..} ln =
198 , cursorCurrentLineNumber = newSize
199 }
200 pure (newState, newSize)
201+ -- otherwise access the offset in the cache (list + array)
202 else do
203 let nextSeekIndex = size - ln
204-
205+ -- if the seek index is bigger than the current list size
206 if nextSeekIndex >= fromIntegral (length idx)
207+ -- try to access the array
208 then case arr of
209 Just a -> hSeek fileHandle AbsoluteSeek (a A.! (nextSeekIndex - fromIntegral (length idx)))
210 Nothing -> error "goToLine: there is no array"
211+ -- otherwise take the offset from the list
212 else hSeek fileHandle AbsoluteSeek (idx !! fromIntegral nextSeekIndex)
213
214- let
215- newState = CursorState
216- { cursorLinesIdx = idx
217- , cursorLinesArrIdx = arr
218- , cursorIdxSize = size
219- , cursorCurrentLineNumber = ln
220- }
221- pure (newState, ln)
222+ pure (cs { cursorCurrentLineNumber = ln } , ln)
223 where
224 readUntil 0 idx = pure idx
225 readUntil counter idx =
226@@ -194,6 +213,13 @@ goToLine' ch@CursorHandle{..} ln =
227 offset <- hTell fileHandle
228 readUntil (counter - 1) (fromInteger offset : idx)
229
230-getFirst :: CursorState -> Integer
231-getFirst (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
232-getFirst (CursorState idx Nothing _ _) = idx !! 0
233+getCursorState' :: CursorHandle -> IO [Integer]
234+getCursorState' CursorHandle{..} = do
235+ CursorState l arr _ _ <- readMVar cursorState
236+ pure $ reverse $ l ++ maybe [] A.elems arr
237+
238+-- Utils
239+
240+getLatestIdx :: CursorState -> Integer
241+getLatestIdx (CursorState idx (Just arr) _ _) = if null idx then arr A.! 0 else idx !! 0
242+getLatestIdx (CursorState idx Nothing _ _) = idx !! 0
+3,
-0
1@@ -81,6 +81,9 @@ main = hspec $ do
2 l' <- getCurrentLine c
3 l' `shouldBe` Nothing
4
5+ s <- getCursorState c
6+ s `shouldBe` [0,57,117,191,244,316,384,429,511,561,616,668,715,761,799,851,907,941,981,1024,1068]
7+
8 it "read line, then go to beginning and forth" $ \(_, c) -> do
9 cln <- getCurrentLineNumber c
10 cln `shouldBe` 0