struct std.process.ArgIteratorWindows [src]

Iterator that implements the Windows command-line parsing algorithm. The implementation is intended to be compatible with the post-2008 C runtime, but is not intended to be compatible with CommandLineToArgvW since CommandLineToArgvW uses the pre-2008 parsing rules.

This iterator faithfully implements the parsing behavior observed from the C runtime with one exception: if the command-line string is empty, the iterator will immediately complete without returning any arguments (whereas the C runtime will return a single argument representing the name of the current executable).

The essential parts of the algorithm are described in Microsoft's documentation on parsing C command-line arguments.

David Deley's article "How Command Line Parameters Are Parsed" explains some additional undocumented quirks in great detail.

Fields

allocator: Allocator
cmd_line: []const u16

Encoded as WTF-16 LE.

index: usize = 0
buffer: []u8

Owned by the iterator. Long enough to hold contiguous NUL-terminated slices of each argument encoded as WTF-8.

start: usize = 0
end: usize = 0

Error Sets

Error Set InitError [src]

Errors

anyerror means the error set is known only at runtime.

OutOfMemory

Source Code

Source code
/// Errors that `init` can return: allocating the iterator's output buffer may fail.
pub const InitError = error{OutOfMemory}

Functions

Function init [src]

pub fn init(allocator: Allocator, cmd_line_w: []const u16) InitError!ArgIteratorWindows

cmd_line_w must be a WTF16-LE-encoded string.

The iterator stores and uses cmd_line_w, so its memory must be valid for at least as long as the returned ArgIteratorWindows.

Parameters

allocator: Allocator
cmd_line_w: []const u16

Source Code

Source code
/// Creates an iterator over the arguments contained in `cmd_line_w`.
///
/// `cmd_line_w` *must* be a WTF-16 LE encoded string. The iterator stores and uses
/// `cmd_line_w` without copying it, so its memory must remain valid for at least
/// as long as the returned ArgIteratorWindows.
///
/// The output buffer is allocated with `allocator` and is released by `deinit`.
/// Returns `error.OutOfMemory` if that allocation fails.
pub fn init(allocator: Allocator, cmd_line_w: []const u16) InitError!ArgIteratorWindows {
    const wtf8_len = unicode.calcWtf8Len(cmd_line_w);

    // This buffer must be large enough to contain contiguous NUL-terminated slices
    // of each argument.
    // - During parsing, the length of a parsed argument will always be less than
    //   or equal to its unparsed length.
    // - The first argument needs one extra byte of space allocated for its NUL
    //   terminator, but for each subsequent argument the necessary whitespace
    //   between arguments guarantees room for their NUL terminator(s).
    const buffer = try allocator.alloc(u8, wtf8_len + 1);

    // Nothing below can fail, so no `errdefer` is required to release `buffer`.
    return .{
        .allocator = allocator,
        .cmd_line = cmd_line_w,
        .buffer = buffer,
    };
}

Function next [src]

pub fn next(self: *ArgIteratorWindows) ?[:0]const u8

Returns the next argument and advances the iterator. Returns null if at the end of the command-line string. The iterator owns the returned slice. The result is encoded as WTF-8.

Parameters

Source Code

Source code
/// Returns the next argument and advances the iterator, or `null` once the end of
/// the command-line string has been reached. The returned slice is NUL-terminated
/// and remains owned by the iterator.
pub fn next(self: *ArgIteratorWindows) ?[:0]const u8 {
    // Run the shared parser with the strategy that writes the argument's bytes
    // and yields them as a slice.
    return self.nextWithStrategy(next_strategy);
}

Function skip [src]

pub fn skip(self: *ArgIteratorWindows) bool

Skips the next argument and advances the iterator. Returns true if an argument was skipped, false if at the end of the command-line string.

Parameters

Source Code

Source code
/// Skips the next argument and advances the iterator. Returns `true` if an
/// argument was skipped, `false` if the end of the command-line string was reached.
pub fn skip(self: *ArgIteratorWindows) bool {
    // Run the shared parser with the strategy that discards output and only
    // reports whether an argument was consumed.
    return self.nextWithStrategy(skip_strategy);
}

Function deinit [src]

pub fn deinit(self: *ArgIteratorWindows) void

Frees the iterator's internal buffer, which invalidates all previously returned argument slices. The command-line string passed to init is not owned by the iterator and is not freed.

Parameters

Source Code

Source code
/// Releases the iterator's `buffer` using the allocator stored in the iterator.
/// Only `buffer` is freed here; `cmd_line` is left untouched.
pub fn deinit(self: *ArgIteratorWindows) void {
    self.allocator.free(self.buffer);
}

Source Code

Source code
/// Iterator over the arguments of a Windows command line, following the parsing
/// rules documented in the comments of `nextWithStrategy`.
///
/// If the command line is empty (or starts with a NUL code unit), the iterator
/// completes immediately without returning any arguments.
pub const ArgIteratorWindows = struct {
    /// Used to allocate and free `buffer`.
    allocator: Allocator,
    /// Encoded as WTF-16 LE. Borrowed from the caller of `init`; the iterator
    /// never frees it.
    cmd_line: []const u16,
    /// Position in `cmd_line` of the next code unit to be parsed.
    index: usize = 0,
    /// Owned by the iterator. Long enough to hold contiguous NUL-terminated slices
    /// of each argument encoded as WTF-8.
    buffer: []u8,
    /// Offset in `buffer` at which the argument currently being built begins.
    start: usize = 0,
    /// Offset in `buffer` one past the last byte written so far.
    end: usize = 0,

    pub const InitError = error{OutOfMemory};

    /// `cmd_line_w` *must* be a WTF-16 LE encoded string.
    ///
    /// The iterator stores and uses `cmd_line_w` without copying it, so its memory
    /// must be valid for at least as long as the returned ArgIteratorWindows.
    ///
    /// The output buffer is allocated with `allocator` and is released by `deinit`.
    /// Returns `error.OutOfMemory` if that allocation fails.
    pub fn init(allocator: Allocator, cmd_line_w: []const u16) InitError!ArgIteratorWindows {
        const wtf8_len = unicode.calcWtf8Len(cmd_line_w);

        // This buffer must be large enough to contain contiguous NUL-terminated slices
        // of each argument.
        // - During parsing, the length of a parsed argument will always be less than
        //   or equal to its unparsed length.
        // - The first argument needs one extra byte of space allocated for its NUL
        //   terminator, but for each subsequent argument the necessary whitespace
        //   between arguments guarantees room for their NUL terminator(s).
        const buffer = try allocator.alloc(u8, wtf8_len + 1);

        // Nothing below can fail, so no `errdefer` is required to release `buffer`.
        return .{
            .allocator = allocator,
            .cmd_line = cmd_line_w,
            .buffer = buffer,
        };
    }

    /// Returns the next argument and advances the iterator. Returns `null` if at the end of the
    /// command-line string. The iterator owns the returned slice.
    /// The result is encoded as [WTF-8](https://simonsapin.github.io/wtf-8/).
    pub fn next(self: *ArgIteratorWindows) ?[:0]const u8 {
        return self.nextWithStrategy(next_strategy);
    }

    /// Skips the next argument and advances the iterator. Returns `true` if an argument was
    /// skipped, `false` if at the end of the command-line string.
    pub fn skip(self: *ArgIteratorWindows) bool {
        return self.nextWithStrategy(skip_strategy);
    }

    /// Strategy used by `next`: writes every emitted code unit into `buffer` as
    /// WTF-8 and yields the finished argument as a NUL-terminated slice.
    const next_strategy = struct {
        const T = ?[:0]const u8;

        const eof = null;

        /// Appends `count` backslashes to the output.
        /// Returns '\' if any backslashes are emitted, otherwise returns `last_emitted_code_unit`.
        fn emitBackslashes(self: *ArgIteratorWindows, count: usize, last_emitted_code_unit: ?u16) ?u16 {
            for (0..count) |_| {
                self.buffer[self.end] = '\\';
                self.end += 1;
            }
            return if (count != 0) '\\' else last_emitted_code_unit;
        }

        /// If `last_emitted_code_unit` and `code_unit` form a surrogate pair, then
        /// the previously emitted high surrogate is overwritten by the codepoint encoded
        /// by the surrogate pair, and `null` is returned.
        /// Otherwise, `code_unit` is emitted and returned.
        fn emitCharacter(self: *ArgIteratorWindows, code_unit: u16, last_emitted_code_unit: ?u16) ?u16 {
            // Because we are emitting WTF-8, we need to
            // check to see if we've emitted two consecutive surrogate
            // codepoints that form a valid surrogate pair in order
            // to ensure that we're always emitting well-formed WTF-8
            // (https://simonsapin.github.io/wtf-8/#concatenating).
            //
            // If we do have a valid surrogate pair, we need to emit
            // the UTF-8 sequence for the codepoint that they encode
            // instead of the WTF-8 encoding for the two surrogates
            // separately.
            //
            // This is relevant when dealing with a WTF-16 encoded
            // command line like this:
            // "<0xD801>"<0xDC37>
            // which would get parsed and converted to WTF-8 as:
            // <0xED><0xA0><0x81><0xED><0xB0><0xB7>
            // but instead, we need to recognize the surrogate pair
            // and emit the codepoint it encodes, which in this
            // example is U+10437 (𐐷), which is encoded in UTF-8 as:
            // <0xF0><0x90><0x90><0xB7>
            if (last_emitted_code_unit) |previous| {
                if (unicode.utf16IsHighSurrogate(previous) and unicode.utf16IsLowSurrogate(code_unit)) {
                    const codepoint = unicode.utf16DecodeSurrogatePair(&.{ previous, code_unit }) catch unreachable;

                    // Unpaired surrogate is 3 bytes long, so step back over it.
                    const dest = self.buffer[self.end - 3 ..];
                    const len = unicode.utf8Encode(codepoint, dest) catch unreachable;
                    // All codepoints that require a surrogate pair (> U+FFFF) are encoded as 4 bytes
                    assert(len == 4);
                    // 3 bytes were already accounted for; the 4-byte encoding adds one more.
                    self.end += 1;
                    return null;
                }
            }

            const wtf8_len = unicode.wtf8Encode(code_unit, self.buffer[self.end..]) catch unreachable;
            self.end += wtf8_len;
            return code_unit;
        }

        /// NUL-terminates the argument built so far, returns it, and positions
        /// `start`/`end` just past the terminator for the next argument.
        fn yieldArg(self: *ArgIteratorWindows) [:0]const u8 {
            self.buffer[self.end] = 0;
            const arg = self.buffer[self.start..self.end :0];
            self.end += 1;
            self.start = self.end;
            return arg;
        }
    };

    /// Strategy used by `skip`: performs no writes to `buffer` and only reports
    /// whether an argument was consumed.
    const skip_strategy = struct {
        const T = bool;

        const eof = false;

        fn emitBackslashes(_: *ArgIteratorWindows, _: usize, last_emitted_code_unit: ?u16) ?u16 {
            return last_emitted_code_unit;
        }

        fn emitCharacter(_: *ArgIteratorWindows, _: u16, last_emitted_code_unit: ?u16) ?u16 {
            return last_emitted_code_unit;
        }

        fn yieldArg(_: *ArgIteratorWindows) bool {
            return true;
        }
    };

    /// Returns the code unit of `cmd_line` at position `i` in native byte order,
    /// or 0 when `i` is at or past the end of the slice, so that running off the
    /// end is handled exactly like encountering a NUL terminator.
    fn codeUnitAt(self: *const ArgIteratorWindows, i: usize) u16 {
        return if (i < self.cmd_line.len)
            mem.littleToNative(u16, self.cmd_line[i])
        else
            0;
    }

    /// Parses one argument starting at `index`, reporting emitted code units to
    /// `strategy` and returning whatever `strategy.yieldArg` produces, or
    /// `strategy.eof` if there are no more arguments.
    fn nextWithStrategy(self: *ArgIteratorWindows, comptime strategy: type) strategy.T {
        var last_emitted_code_unit: ?u16 = null;
        // The first argument (the executable name) uses different parsing rules:
        // quotes only toggle 'inside_quotes' and backslashes are never special.
        if (self.index == 0) {
            if (self.codeUnitAt(0) == 0) {
                // Immediately complete the iterator.
                // The C runtime would return the name of the current executable here.
                return strategy.eof;
            }

            var inside_quotes = false;
            while (true) : (self.index += 1) {
                const char = self.codeUnitAt(self.index);
                switch (char) {
                    0 => {
                        return strategy.yieldArg(self);
                    },
                    '"' => {
                        inside_quotes = !inside_quotes;
                    },
                    ' ', '\t' => {
                        if (inside_quotes) {
                            last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                        } else {
                            // Step past the separator so that `index` is non-zero even
                            // when the first argument is empty.
                            self.index += 1;
                            return strategy.yieldArg(self);
                        }
                    },
                    else => {
                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                    },
                }
            }
        }

        // Skip spaces and tabs. The iterator completes if we reach the end of the string here.
        while (true) : (self.index += 1) {
            switch (self.codeUnitAt(self.index)) {
                0 => return strategy.eof,
                ' ', '\t' => continue,
                else => break,
            }
        }

        // Parsing rules for subsequent arguments:
        //
        // - The end of the string always terminates the current argument.
        // - When not in 'inside_quotes' mode, a space or tab terminates the current argument.
        // - 2n backslashes followed by a quote emit n backslashes (note: n can be zero).
        //   If in 'inside_quotes' and the quote is immediately followed by a second quote,
        //   one quote is emitted and the other is skipped, otherwise, the quote is skipped
        //   and 'inside_quotes' is toggled.
        // - 2n + 1 backslashes followed by a quote emit n backslashes followed by a quote.
        // - n backslashes not followed by a quote emit n backslashes.
        var backslash_count: usize = 0;
        var inside_quotes = false;
        while (true) : (self.index += 1) {
            const char = self.codeUnitAt(self.index);
            switch (char) {
                0 => {
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                    return strategy.yieldArg(self);
                },
                ' ', '\t' => {
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                    backslash_count = 0;
                    if (inside_quotes) {
                        last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                    } else return strategy.yieldArg(self);
                },
                '"' => {
                    const char_is_escaped_quote = backslash_count % 2 != 0;
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count / 2, last_emitted_code_unit);
                    backslash_count = 0;
                    if (char_is_escaped_quote) {
                        last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
                    } else if (inside_quotes and self.codeUnitAt(self.index + 1) == '"') {
                        // A doubled quote inside a quoted region: emit one quote and
                        // consume the second one without leaving 'inside_quotes' mode.
                        last_emitted_code_unit = strategy.emitCharacter(self, '"', last_emitted_code_unit);
                        self.index += 1;
                    } else {
                        inside_quotes = !inside_quotes;
                    }
                },
                '\\' => {
                    // Defer emitting until we know whether a quote follows.
                    backslash_count += 1;
                },
                else => {
                    last_emitted_code_unit = strategy.emitBackslashes(self, backslash_count, last_emitted_code_unit);
                    backslash_count = 0;
                    last_emitted_code_unit = strategy.emitCharacter(self, char, last_emitted_code_unit);
                },
            }
        }
    }

    /// Frees the iterator's `buffer`, which invalidates all previously returned
    /// argument slices. `cmd_line` is not owned by the iterator and is not freed.
    pub fn deinit(self: *ArgIteratorWindows) void {
        self.allocator.free(self.buffer);
    }
};