Skip to content

Commit abf6392

Browse files
committed
[FileFormats.MPS] improve performance of parsing each line
1 parent ae95d6a commit abf6392

2 files changed

Lines changed: 135 additions & 49 deletions

File tree

src/FileFormats/MPS/read.jl

Lines changed: 126 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -156,9 +156,7 @@ end
156156
HEADER_INDICATORS,
157157
)
158158

159-
# `Headers` gets called _alot_ (on every line), so we try very hard to be
160-
# efficient.
161-
function Headers(s)
159+
function parse_single_header(s::AbstractString)
162160
N = length(s)
163161
x = first(s)
164162
if N == 3
@@ -180,7 +178,7 @@ function Headers(s)
180178
return HEADER_ENDATA
181179
end
182180
elseif N == 7
183-
if (x == 'C' || x == 'c') && (uppercase(s) == "COLUMNS")
181+
if (x == 'C' || x == 'c') && uppercase(s) == "COLUMNS"
184182
return HEADER_COLUMNS
185183
elseif (x == 'Q' || x == 'q')
186184
header = uppercase(s)
@@ -190,34 +188,125 @@ function Headers(s)
190188
return HEADER_QMATRIX
191189
end
192190
end
193-
elseif N >= 8
194-
if (x == 'O' || x == 'o') && startswith(uppercase(s), "OBJSENSE")
191+
elseif N == 8
192+
if (x == 'O' || x == 'o') && uppercase(s) == "OBJSENSE"
195193
return HEADER_OBJSENSE
196-
elseif (x == 'Q' || x == 'q')
197-
header = uppercase(s)
198-
if startswith(header, "QCMATRIX")
199-
return HEADER_QCMATRIX
200-
elseif startswith(header, "QSECTION")
201-
return HEADER_QSECTION
194+
end
195+
elseif N == 10
196+
if (x == 'I' || x == 'i') && uppercase(s) == "INDICATORS"
197+
return HEADER_INDICATORS
198+
end
199+
end
200+
return HEADER_UNKNOWN
201+
end
202+
203+
function parse_double_header(s::AbstractString)
    # Classify the first item of a two-item line. Only the eight-character
    # keywords OBJSENSE, QCMATRIX and QSECTION are recognised here; the match
    # is case-insensitive. Anything else yields HEADER_UNKNOWN.
    N = length(s)
    x = first(s)
    if N != 8
        return HEADER_UNKNOWN
    end
    # Test the leading character first so that `uppercase` (which builds a new
    # string) is only called for plausible candidates.
    if x == 'O' || x == 'o'
        if uppercase(s) == "OBJSENSE"
            return HEADER_OBJSENSE
        end
    elseif x == 'Q' || x == 'q'
        keyword = uppercase(s)
        startswith(keyword, "QCMATRIX") && return HEADER_QCMATRIX
        startswith(keyword, "QSECTION") && return HEADER_QSECTION
    end
    return HEADER_UNKNOWN
end
220+
221+
"""
222+
LineToItems(line::String)
223+
224+
Split on any whitespace characters. We can't split only on `' '` because at
225+
least one model in MIPLIB has `\t` as a separator.
226+
227+
This decision assumes that we are parsing a free MPS file, where whitespace is
228+
disallowed in names. If this ever becomes a problem, we could change to the
229+
fixed MPS format, where the files are split at the usual offsets.
230+
231+
This function is a more performant version of:
232+
```julia
233+
LineToItems(line::String) = split(line, r"\\s"; keepempty = false)
234+
```
235+
"""
236+
struct LineToItems
    # The line being split; every item is a view into this string.
    line::String
    # Number of whitespace-delimited items found in `line`. This keeps counting
    # past five, but only the first five index ranges are stored in `fields`.
    nfields::Int
    # Index ranges into `line` for the first five items. Unused slots are `0:0`.
    fields::NTuple{5,UnitRange{Int}}

    function LineToItems(line::String)
        nfields, f1, f2, f3, f4, f5 = 0, 0:0, 0:0, 0:0, 0:0, 0:0
        # `start` and `stop` are the indices of the first and last character of
        # the item currently being scanned.
        start, stop, in_field = -1, -1, false
        i = firstindex(line)
        while true
            # Walk the string one character at a time via `iterate`, so that
            # `i` is always a valid character index. Indexing every code unit
            # with `line[i]` throws a `StringIndexError` on the continuation
            # bytes of multi-byte UTF-8 characters.
            next = iterate(line, i)
            # The end of the line terminates an item just like whitespace does.
            if next === nothing || isspace(next[1])
                if in_field
                    nfields += 1
                    if nfields == 1
                        f1 = start:stop
                    elseif nfields == 2
                        f2 = start:stop
                    elseif nfields == 3
                        f3 = start:stop
                    elseif nfields == 4
                        f4 = start:stop
                    elseif nfields == 5
                        f5 = start:stop
                    end
                    in_field = false
                end
                if next === nothing
                    break
                end
            else
                if !in_field
                    start, in_field = i, true
                end
                # Record the index at which the last character of the item
                # begins; `SubString` requires a valid character index here.
                stop = i
            end
            i = next[2]
        end
        return new(line, nfields, (f1, f2, f3, f4, f5))
    end
end
211284

212-
function line_to_items(line)
213-
# Split on any whitespace characters. We can't split only on `' '` because
214-
# at least one models in MIPLIB has `\t` as a separator.
215-
#
216-
# This decision assumes that we are parsing a free MPS file, where
217-
# whitespace is disallowed in names. If this ever becomes a problem, we
218-
# could change to the fixed MPS format, where the files are split at the
219-
# usual offsets.
220-
return split(line, r"\s"; keepempty = false)
285+
# The number of items is the count recorded when the line was split.
function Base.length(x::LineToItems)
    return x.nfields
end
286+
287+
# Return the `i`-th item of the line as a `SubString` view into `x.line`.
function Base.getindex(x::LineToItems, i::Int)
    @assert 1 <= i <= x.nfields
    span = x.fields[i]
    return SubString(x.line, span)
end
291+
292+
# Iteration protocol: the state is the index of the next item to return, and
# iteration starts at item 1.
function Base.iterate(x::LineToItems, i = 1)
    return i > x.nfields ? nothing : (x[i], i + 1)
end
300+
301+
# `parse_header` gets called a lot (on every line), so we try very hard to be
302+
# efficient.
303+
function parse_header(s::LineToItems)
    # Only lines with exactly one or two items are checked for a header
    # keyword, and in both cases only the first item is inspected.
    n = length(s)
    n == 1 && return parse_single_header(s[1])
    n == 2 && return parse_double_header(s[1])
    return HEADER_UNKNOWN
end
222311

223312
"""
@@ -237,13 +326,12 @@ function Base.read!(io::IO, model::Model{T}) where {T}
237326
if startswith(data.contents, '*')
238327
continue # Lines starting with `*` are comments
239328
end
240-
line = string(strip(data.contents))
241-
if isempty(line)
329+
items = LineToItems(data.contents)
330+
if length(items) == 0
242331
continue # Skip blank lines
243332
end
244-
h = Headers(line)
333+
h = parse_header(items)
245334
if h == HEADER_OBJSENSE
246-
items = line_to_items(line)
247335
if length(items) == 2
248336
sense = uppercase(items[2])
249337
if !(sense in ("MIN", "MAX"))
@@ -258,7 +346,6 @@ function Base.read!(io::IO, model::Model{T}) where {T}
258346
end
259347
continue
260348
elseif h == HEADER_QCMATRIX || h == HEADER_QSECTION
261-
items = line_to_items(line)
262349
if length(items) != 2
263350
_throw_parse_error(
264351
data,
@@ -274,10 +361,8 @@ function Base.read!(io::IO, model::Model{T}) where {T}
274361
continue
275362
end
276363
# Otherwise, carry on with the previous header
277-
# TODO: split into hard fields based on column indices.
278-
items = line_to_items(line)
279364
if header == HEADER_NAME
280-
parse_name_line(data, line)
365+
parse_name_line(data)
281366
elseif header == HEADER_OBJSENSE
282367
sense = uppercase(only(items))
283368
if !(sense in ("MIN", "MAX"))
@@ -490,8 +575,8 @@ end
490575
# NAME
491576
# ==============================================================================
492577

493-
function parse_name_line(data::TempMPSModel, line)
494-
m = match(r"^\s*NAME(.*)"i, line)
578+
function parse_name_line(data::TempMPSModel)
579+
m = match(r"^\s*NAME(.*)"i, data.contents)
495580
if m === nothing
496581
_throw_parse_error(
497582
data,
@@ -506,7 +591,7 @@ end
506591
# ROWS
507592
# ==============================================================================
508593

509-
function parse_rows_line(data::TempMPSModel{T}, items::Vector) where {T}
594+
function parse_rows_line(data::TempMPSModel{T}, items) where {T}
510595
if length(items) < 2
511596
_throw_parse_error(
512597
data,
@@ -619,7 +704,7 @@ function _set_intorg(data::TempMPSModel{T}, column, column_name) where {T}
619704
return
620705
end
621706

622-
function parse_columns_line(data::TempMPSModel{T}, items::Vector) where {T}
707+
function parse_columns_line(data::TempMPSModel{T}, items) where {T}
623708
if length(items) == 3
624709
# [column name] [row name] [value]
625710
column_name, row_name, value = items
@@ -657,7 +742,7 @@ end
657742
# RHS
658743
# ==============================================================================
659744

660-
function parse_single_rhs(data, row_name, value, items::Vector)
745+
function parse_single_rhs(data, row_name, value, items)
661746
if row_name == data.obj_name
662747
data.obj_constant = value
663748
return
@@ -688,7 +773,7 @@ function parse_single_rhs(data, row_name, value, items::Vector)
688773
end
689774

690775
# TODO: handle multiple RHS vectors.
691-
function parse_rhs_line(data::TempMPSModel{T}, items::Vector) where {T}
776+
function parse_rhs_line(data::TempMPSModel{T}, items) where {T}
692777
if length(items) == 3
693778
# [rhs name] [row name] [value]
694779
rhs_name, row_name, value = items
@@ -744,7 +829,7 @@ function parse_single_range(data, row_name, value)
744829
end
745830

746831
# TODO: handle multiple RANGES vectors.
747-
function parse_ranges_line(data::TempMPSModel{T}, items::Vector) where {T}
832+
function parse_ranges_line(data::TempMPSModel{T}, items) where {T}
748833
if length(items) == 3
749834
# [rhs name] [row name] [value]
750835
_, row_name, value = items
@@ -859,7 +944,7 @@ function _parse_single_bound(
859944
end
860945
end
861946

862-
function parse_bounds_line(data::TempMPSModel{T}, items::Vector) where {T}
947+
function parse_bounds_line(data::TempMPSModel{T}, items) where {T}
863948
if length(items) == 3
864949
bound_type, _, column_name = items
865950
_parse_single_bound(data, column_name, bound_type)

test/FileFormats/MPS/test_MPS.jl

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1126,11 +1126,12 @@ function test_parse_name_line()
11261126
" NAME foo" => "foo",
11271127
"" => nothing,
11281128
)
1129+
data.contents = line
11291130
data.name = "_"
11301131
if name === nothing
1131-
@test_throws MPS.ParseError MPS.parse_name_line(data, line)
1132+
@test_throws MPS.ParseError MPS.parse_name_line(data)
11321133
else
1133-
MPS.parse_name_line(data, line)
1134+
MPS.parse_name_line(data)
11341135
@test data.name == name
11351136
end
11361137
end
@@ -1702,12 +1703,12 @@ function test_issue_2792()
17021703
end
17031704

17041705
function test_issue_2797_tab()
    # Each entry maps an input line to the items it should be split into.
    # Spaces, tabs and mixtures of the two all act as separators.
    cases = (
        "a b" => ["a", "b"],
        " a b" => ["a", "b"],
        "a\tb" => ["a", "b"],
        "a\tb" => ["a", "b"],
        "a\t b" => ["a", "b"],
        " a \t b c " => ["a", "b", "c"],
    )
    for (line, expected) in cases
        @test collect(MPS.LineToItems(line)) == expected
    end
    return
end
17131714

0 commit comments

Comments
 (0)