Skip to content

Commit

Permalink
Improve performance (#38)
Browse files Browse the repository at this point in the history
Add an optimized `readbytes!` method which copies data in chunks instead of byte by byte.
Internally, avoid passing a `SubArray` to `readbytes!` as there is currently no optimized
method for them: instead, wrap the corresponding memory in an `Array`.
Increase the size of the buffer from 100 to 200 bytes, which appears to be a good tradeoff.
This makes loading a file about 10 times faster than before.
  • Loading branch information
nalimilan committed Aug 4, 2020
1 parent c9c34ae commit f658892
Show file tree
Hide file tree
Showing 2 changed files with 71 additions and 7 deletions.
36 changes: 32 additions & 4 deletions src/StringEncodings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ end
using Base.Libc: errno, strerror, E2BIG, EINVAL, EILSEQ

import Base: close, eachline, eof, flush, isreadable, iswritable,
open, readline, readlines, readuntil, show, write, read
open, read, readbytes!, readline, readlines, readuntil, show, write

export StringEncoder, StringDecoder, encode, decode, encodings
export StringEncodingError, OutputBufferError, IConvError
Expand Down Expand Up @@ -94,7 +94,7 @@ end

## StringEncoder and StringDecoder common functions

const BUFSIZE = 100
const BUFSIZE = 200

mutable struct StringEncoder{F<:Encoding, T<:Encoding, S<:IO} <: IO
stream::S
Expand Down Expand Up @@ -318,7 +318,13 @@ function fill_buffer!(s::StringDecoder)
return i
end

s.inbytesleft[] += readbytes!(s.stream, view(s.inbuf, Int(s.inbytesleft[]+1):BUFSIZE))
# readbytes! performance with SubArray was improved by JuliaLang/julia#36607,
# so a plain view of the free tail of `s.inbuf` is used on recent Julia versions.
# On older versions the same memory is wrapped in an `Array` instead.
# Both branches must expose exactly the free tail of the buffer, i.e.
# `BUFSIZE - s.inbytesleft[]` bytes starting at index `s.inbytesleft[] + 1`:
# wrapping a full `BUFSIZE` bytes from that offset would let `readbytes!`
# write past the end of `s.inbuf` whenever some input bytes are still pending.
@static if VERSION >= v"1.6.0-DEV.438"
    inbuf_view = view(s.inbuf, Int(s.inbytesleft[]+1):BUFSIZE)
else
    inbuf_view = unsafe_wrap(Array, pointer(s.inbuf, s.inbytesleft[]+1),
                             BUFSIZE - Int(s.inbytesleft[]))
end
# Account for the newly read bytes on top of those left over from the last call
s.inbytesleft[] += readbytes!(s.stream, inbuf_view)
iconv!(s.cd, s.inbuf, s.outbuf, s.inbufptr, s.outbufptr, s.inbytesleft, s.outbytesleft)
end

Expand All @@ -328,7 +334,7 @@ end
# data contains only state control sequences which may be converted to nothing)
# 3) if not, reset iconv to initial state, which may generate data
function eof(s::StringDecoder)
# End of stream is reported only when all of the conditions below hold; they
# are evaluated in order and short-circuit, so `fill_buffer!` and
# `iconv_reset!` are only called once the output buffer has been fully consumed.
# NOTE(review): the next two lines are the pre- and post-change forms of the
# same check as rendered by the diff view; they are equivalent as long as
# `length(s.outbuf) == BUFSIZE` -- confirm against the struct definition.
length(s.outbuf) - s.outbytesleft[] == s.skip &&
BUFSIZE - s.outbytesleft[] == s.skip &&
# no new data could be obtained by refilling the buffer
fill_buffer!(s) == 0 &&
# resetting iconv to its initial state did not generate data either
iconv_reset!(s) == 0
end
Expand Down Expand Up @@ -403,6 +409,28 @@ function open(fname::AbstractString, enc::Encoding, mode::AbstractString)
wrap_stream(open(fname, mode), enc)
end

# Optimized method adapted from Base but reading as many bytes
# as the buffer contains on each iteration rather than a single one,
# which increases performance dramatically.
#
# Read at most `nb` decoded bytes from `s` into `b` and return the number of
# bytes actually read. As with the generic Base method, `b` is grown when it is
# too small to hold the requested data (and then shrunk to exactly the bytes
# read); its length is left untouched otherwise.
function readbytes!(s::StringDecoder, b::AbstractArray{UInt8}, nb=length(b))
    olb = lb = length(b)
    nr = 0
    while nr < nb && !eof(s)
        # The output buffer holds `BUFSIZE - s.outbytesleft[]` converted bytes,
        # of which the first `s.skip` have already been handed out by previous
        # reads (this is the same bookkeeping `eof` relies on). Only the
        # remainder may be copied: ignoring `s.skip` here would copy stale or
        # out-of-bounds bytes after a partial read.
        # Converting to `Int` keeps `nc` and `nr` signed and type-stable even if
        # the counter is stored as an unsigned integer.
        avail = Int(BUFSIZE - s.outbytesleft[]) - Int(s.skip)
        nc = min(nb - nr, avail)
        if nr + nc > lb
            # Grow geometrically to limit the number of reallocations
            lb = (nr + nc) * 2
            resize!(b, lb)
        end
        copyto!(b, firstindex(b) + nr, s.outbuf, s.skip + 1, nc)
        s.skip += nc
        nr += nc
    end
    if lb > olb
        resize!(b, nr) # shrink to just contain input data if was resized
    end
    return nr
end

"""
read(stream::IO, [nb::Integer,] enc::Encoding)
read(filename::AbstractString, [nb::Integer,] enc::Encoding)
Expand Down
42 changes: 39 additions & 3 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ mktemp() do path, io
write(io, s)
end

nc = ncodeunits(first(s, 90))

@test String(read(path, enc"ISO-2022-JP")) == s
@test String(open(io->read(io, enc"ISO-2022-JP"), path)) == s
@test String(open(io->read(io), path, enc"ISO-2022-JP")) == s
Expand All @@ -149,14 +151,48 @@ mktemp() do path, io
@test String(open(io->read(io, 1000, enc"ISO-2022-JP"), path)) == s
@test String(open(io->read(io, 1000), path, enc"ISO-2022-JP")) == s

@test String(read(path, 10, enc"ISO-2022-JP")) == first(s, 10)
@test String(open(io->read(io, 10, enc"ISO-2022-JP"), path)) == first(s, 10)
@test String(open(io->read(io, 10), path, enc"ISO-2022-JP")) == first(s, 10)
@test String(read(path, nc, enc"ISO-2022-JP")) == first(s, nc)
@test String(open(io->read(io, nc, enc"ISO-2022-JP"), path)) == first(s, nc)
@test String(open(io->read(io, nc), path, enc"ISO-2022-JP")) == first(s, nc)

@test read(path, String, enc"ISO-2022-JP") == s
@test open(io->read(io, String, enc"ISO-2022-JP"), path) == s
@test open(io->read(io, String), path, enc"ISO-2022-JP") == s

b = zeros(UInt8, nc)
@test open(io->read!(io, b), path, enc"ISO-2022-JP") === b
@test String(b) == first(s, 90)

b = zeros(UInt8, nc)
@test open(io->readbytes!(io, b), path, enc"ISO-2022-JP") == ncodeunits(s)
@test String(b) == s

b = zeros(UInt8, 1000)
@test open(io->readbytes!(io, b), path, enc"ISO-2022-JP") == ncodeunits(s)
@test length(b) == 1000
@test String(b[1:ncodeunits(s)]) == s

b = UInt8[]
@test open(io->readbytes!(io, b), path, enc"ISO-2022-JP") == 0
@test length(b) == 0

b = zeros(UInt8, nc)
@test open(io->readbytes!(io, b, nc), path, enc"ISO-2022-JP") == nc
@test String(b) == first(s, 90)

b = zeros(UInt8, 1000)
@test open(io->readbytes!(io, b, nc), path, enc"ISO-2022-JP") == nc
@test length(b) == 1000
@test String(b[1:nc]) == first(s, 90)

b = UInt8[]
@test open(io->readbytes!(io, b, nc), path, enc"ISO-2022-JP") == nc
@test String(b) == first(s, 90)

b = UInt8[]
open(io->while !eof(io); push!(b, read(io, UInt8)) end, path, enc"ISO-2022-JP")
@test String(b) == s

@test readuntil(path, enc"ISO-2022-JP", '\0') == "a string "
@test open(io->readuntil(io, enc"ISO-2022-JP", '\0'), path) == "a string "
@test open(io->readuntil(io, enc"ISO-2022-JP", '\0', keep=true), path) == "a string \0"
Expand Down

2 comments on commit f658892

@nalimilan
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/18971

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.2 -m "<description of version>" f658892195e7d038f36bf33203d6eb026dd309b1
git push origin v0.3.2

Please sign in to comment.