-
Notifications
You must be signed in to change notification settings - Fork 4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reduce string allocation from TextFactory #14531
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,14 +24,15 @@ internal sealed class LargeText : SourceText | |
private readonly ImmutableArray<char[]> _chunks; | ||
private readonly int[] _chunkStartOffsets; | ||
private readonly int _length; | ||
private readonly Encoding _encoding; | ||
private readonly Encoding _encodingOpt; | ||
|
||
internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob) | ||
internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, ImmutableArray<byte> checksum, SourceHashAlgorithm checksumAlgorithm, ImmutableArray<byte> embeddedTextBlob) | ||
: base(checksum, checksumAlgorithm, embeddedTextBlob) | ||
{ | ||
_chunks = chunks; | ||
_encoding = encoding; | ||
_encodingOpt = encodingOpt; | ||
_chunkStartOffsets = new int[chunks.Length]; | ||
|
||
int offset = 0; | ||
for (int i = 0; i < chunks.Length; i++) | ||
{ | ||
|
@@ -42,6 +43,11 @@ internal LargeText(ImmutableArray<char[]> chunks, Encoding encoding, ImmutableAr | |
_length = offset; | ||
} | ||
|
||
internal LargeText(ImmutableArray<char[]> chunks, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm) | ||
: this(chunks, encodingOpt, default(ImmutableArray<byte>), checksumAlgorithm, default(ImmutableArray<byte>)) | ||
{ | ||
} | ||
|
||
internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAlgorithm checksumAlgorithm, bool throwIfBinaryDetected, bool canBeEmbedded) | ||
{ | ||
stream.Seek(0, SeekOrigin.Begin); | ||
|
@@ -58,48 +64,69 @@ internal static SourceText Decode(Stream stream, Encoding encoding, SourceHashAl | |
|
||
using (var reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: Math.Min(length, 4096), leaveOpen: true)) | ||
{ | ||
ArrayBuilder<char[]> chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize); | ||
while (!reader.EndOfStream) | ||
{ | ||
var nextChunkSize = ChunkSize; | ||
if (maxCharRemainingGuess < ChunkSize) | ||
{ | ||
// maxCharRemainingGuess typically overestimates a little | ||
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk | ||
// and then use 64 char tail, which is likley to be resized. | ||
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64); | ||
} | ||
var chunks = ReadChunksFromTextReader(reader, maxCharRemainingGuess, throwIfBinaryDetected); | ||
|
||
char[] chunk = new char[nextChunkSize]; | ||
// We must compute the checksum and embedded text blob now while we still have the original bytes in hand. | ||
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip. | ||
var checksum = CalculateChecksum(stream, checksumAlgorithm); | ||
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the 'embeddedTextBlob'? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nguerrera is probably better person to answer There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is the blob that encodes (optionally) embedded source in a PDB. See #12625 for design info. |
||
return new LargeText(chunks, reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob); | ||
} | ||
} | ||
|
||
int charsRead = reader.ReadBlock(chunk, 0, chunk.Length); | ||
if (charsRead == 0) | ||
{ | ||
break; | ||
} | ||
internal static SourceText Decode(TextReader reader, int length, Encoding encodingOpt, SourceHashAlgorithm checksumAlgorithm) | ||
{ | ||
if (length == 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you put a comment mentioning why this optimization is necessary? i.e. what percentage of readers have length=0 and how much this saves us to optimize. Thanks! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm surprised we have any callers calling LargeText.Decode witha length of 0. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dotnet/roslyn-compiler compiler team is probably the one should answer. |
||
{ | ||
return SourceText.From(string.Empty, encodingOpt, checksumAlgorithm); | ||
} | ||
|
||
maxCharRemainingGuess -= charsRead; | ||
// throwIfBinaryDetected == false since we are given text reader from the beginning | ||
var chunks = ReadChunksFromTextReader(reader, length, throwIfBinaryDetected: false); | ||
|
||
if (charsRead < chunk.Length) | ||
{ | ||
Array.Resize(ref chunk, charsRead); | ||
} | ||
return new LargeText(chunks, encodingOpt, checksumAlgorithm); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why 'LargeText'? 'Large' implies that it is used for, well, large inputs. But what if the length was only '1', would we still make a "Large" text? Or, if LargeText is for texts of any size, perhaps its name should be changed. Maybe ChunkedSourceText? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah,n/m. This is LargeText.Decode, not SourceText.Decode. So the lenght check was already done before. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am following existing decode pattern. @dotnet/roslyn-compiler probably better people to answer |
||
} | ||
|
||
// Check for binary files | ||
if (throwIfBinaryDetected && IsBinary(chunk)) | ||
{ | ||
throw new InvalidDataException(); | ||
} | ||
private static ImmutableArray<char[]> ReadChunksFromTextReader(TextReader reader, int maxCharRemainingGuess, bool throwIfBinaryDetected) | ||
{ | ||
var chunks = ArrayBuilder<char[]>.GetInstance(1 + maxCharRemainingGuess / ChunkSize); | ||
|
||
chunks.Add(chunk); | ||
while (reader.Peek() != -1) | ||
{ | ||
var nextChunkSize = ChunkSize; | ||
if (maxCharRemainingGuess < ChunkSize) | ||
{ | ||
// maxCharRemainingGuess typically overestimates a little | ||
// so we will first fill a slightly smaller (maxCharRemainingGuess - 64) chunk | ||
// and then use 64 char tail, which is likley to be resized. | ||
nextChunkSize = Math.Max(maxCharRemainingGuess - 64, 64); | ||
} | ||
|
||
// We must compute the checksum and embedded text blob now while we still have the original bytes in hand. | ||
// We cannot re-encode to obtain checksum and blob as the encoding is not guaranteed to round-trip. | ||
var checksum = CalculateChecksum(stream, checksumAlgorithm); | ||
var embeddedTextBlob = canBeEmbedded ? EmbeddedText.CreateBlob(stream) : default(ImmutableArray<byte>); | ||
return new LargeText(chunks.ToImmutableAndFree(), reader.CurrentEncoding, checksum, checksumAlgorithm, embeddedTextBlob); | ||
char[] chunk = new char[nextChunkSize]; | ||
|
||
int charsRead = reader.ReadBlock(chunk, 0, chunk.Length); | ||
if (charsRead == 0) | ||
{ | ||
break; | ||
} | ||
|
||
maxCharRemainingGuess -= charsRead; | ||
|
||
if (charsRead < chunk.Length) | ||
{ | ||
Array.Resize(ref chunk, charsRead); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this resizing will not actually change 'chunk', it will allocate another array, copy the data inot it, then point 'ref chunk' at that new array. Given that, do we want to pool chunks so that we don't leak them in the case where we need to resize? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't believe there is any pool here for chunk. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could add one :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if there is data that shows it help. but I doubt that will help since the chunk will live as long as source text live and the pool will statically alive in memory and the size should be big assuming it is for large text. |
||
} | ||
|
||
// Check for binary files | ||
if (throwIfBinaryDetected && IsBinary(chunk)) | ||
{ | ||
throw new InvalidDataException(); | ||
} | ||
|
||
chunks.Add(chunk); | ||
} | ||
|
||
return chunks.ToImmutableAndFree(); | ||
} | ||
|
||
/// <summary> | ||
|
@@ -146,7 +173,7 @@ public override char this[int position] | |
} | ||
} | ||
|
||
public override Encoding Encoding => _encoding; | ||
public override Encoding Encoding => _encodingOpt; | ||
|
||
public override int Length => _length; | ||
|
||
|
@@ -272,7 +299,7 @@ private int[] ParseLineStarts() | |
case '\u0085': | ||
case '\u2028': | ||
case '\u2029': | ||
line_break: | ||
line_break: | ||
arrayBuilder.Add(position); | ||
position = index; | ||
break; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CC @AnthonyDGreen, @jaredpar, @gafter For new public API.