Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions Lib/test/test_json/test_speedups.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,60 @@ def test(name):
def test_unsortable_keys(self):
with self.assertRaises(TypeError):
self.json.encoder.JSONEncoder(sort_keys=True).encode({'a': 1, 1: 'a'})

def test_indent_argument_to_encoder(self):
# gh-143196: indent must be str, int, or None
# int is converted to spaces
enc = self.json.encoder.c_make_encoder(
None, lambda obj: obj, lambda obj: obj,
4, ':', ', ', False, False, False,
)
result = enc({'a': 1}, 0)
self.assertIn(' ', result[0]) # 4 spaces

# Negative int should raise ValueError
with self.assertRaisesRegex(
ValueError,
r'make_encoder\(\) argument 4 must be a non-negative int',
):
self.json.encoder.c_make_encoder(None, None, None, -1, ': ', ', ',
False, False, False)

# Other types should raise TypeError
with self.assertRaisesRegex(
TypeError,
r'make_encoder\(\) argument 4 must be str, int, or None, not list',
):
self.json.encoder.c_make_encoder(None, None, None, [' '],
': ', ', ', False, False, False)

def test_nonzero_indent_level_with_indent(self):
# gh-143196: _current_indent_level must be 0 when indent is set
# This prevents heap-buffer-overflow from uninitialized cache access
# and also prevents re-entrant __mul__ attacks since PySequence_Repeat
# is only called when indent_level != 0
enc = self.json.encoder.c_make_encoder(
None, lambda obj: obj, lambda obj: obj,
' ', ':', ', ', False, False, False,
)
# indent_level=0 should work
enc([None], 0)
# indent_level!=0 should raise ValueError
with self.assertRaisesRegex(
ValueError,
r'_current_indent_level must be 0 when indent is set',
):
enc([None], 1)

# Verify that str subclasses with custom __mul__ are safe because
# __mul__ is never called when indent_level=0
class CustomIndent(str):
def __mul__(self, count):
raise RuntimeError("__mul__ should not be called")

enc2 = self.json.encoder.c_make_encoder(
None, lambda obj: obj, lambda obj: obj,
CustomIndent(' '), ':', ', ', False, False, False,
)
# This should work - __mul__ is not called when indent_level=0
enc2({'a': 1}, 0)
43 changes: 41 additions & 2 deletions Modules/_json.c
Original file line number Diff line number Diff line change
Expand Up @@ -1319,14 +1319,47 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
return NULL;
}

/* Convert indent to str if it's not None or already a string */
if (indent != Py_None && !PyUnicode_Check(indent)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of these complex checks (argument 1 is also checked) I'd suggest that we convert __new__ to Argument Clinic. However, I'm not entirely sure that we won't lose performance. How many times do we call this function?

Copy link
Author

@caverac caverac Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance considerations

For context, my original proposal was to use a simpler change

static PyObject *
encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    ...
    if (indent != Py_None && !PyUnicode_Check(indent)) {
        PyErr_Format(PyExc_TypeError,
                     "make_encoder() argument 4 must be str or None, "
                     "not %.200s", Py_TYPE(indent)->tp_name);
        return NULL;
    }
    ...
}

Let me call that fix-v1.

After a comment/suggestion in the reported issue I implemented the unicode check with the additional logical statements (that is the current branch); let me call that fix.

And finally we have main. What I did was run some tests against all these three branches. Basically ran a warm-up of the following snippet and then 100 times to measure average executing time

start = time.perf_counter()
json.dumps(data, indent=2)
end = time.perf_counter()

These are the results

Test fix fix-v1 main
128KB.json 1.97 ms 2.46 ms 1.95 ms
256KB.json 2.04 ms 2.03 ms 2.03 ms
512KB.json 4.04 ms 4.05 ms 4.07 ms
1MB.json 8.83 ms 8.00 ms 8.14 ms
5MB.json 42.02 ms 41.48 ms 42.02 ms

My local machine is a bit outdated, and I'm pretty confident these numbers can be reduced on a faster CPU, but I do believe that performance is not impacted, or at least my changes are statistically consistent with main

Using the argument clinic

I'm rabbit-holing a bit into that approach, not entirely sure yet how it works. But my take on it: it would require some major changes to the module, that I don't feel comfortable messing with. If that is the desired path, I will be happy to document my findings in the reported issue, and leave it to more expert/capable hands to complete.

"How many times do we call this function?"

I am not entirely sure what you mean by this question, sorry 😢 encoder_new is called once per JSONEncoder instance creation. In typical usage (json.dumps()),

json.dumps(data, indent=2)
    |-- JSONEncoder.encode()
          |-- JSONEncoder.iterencode()
                |-- c_make_encoder()
                      |-- encoder_new()  <= called only once
                |-- _iterencode(obj, 0)
                      |-- encoder_listencode_obj()
                            |-- encoder_listencode_list()
                            |-- encoder_listencode_dict()

it's called once per encoding operation. "We" don't call it, the user does and, as I showed above, they are not going to experience any measurable changes in performance. My apologies again if I misunderstood your question

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "we" was a generic "we" (academical one). But you answered my question well. I wondered whether c_make_encoder was called multiple times when doing a single json.dumps().

Conversion to AC may introduce an overhead in how the arguments are pre-processed but if you're not comfortable with this, we can first fix this (and backport it) and then switch to AC for main.

Py_ssize_t indent_level;
if (!PyIndex_Check(indent)) {
PyErr_Format(PyExc_TypeError,
"make_encoder() argument 4 must be str, int, or None, "
"not %.200s", Py_TYPE(indent)->tp_name);
return NULL;
}
indent_level = PyNumber_AsSsize_t(indent, PyExc_ValueError);
if (indent_level == -1 && PyErr_Occurred()) {
return NULL;
}
if (indent_level < 0) {
PyErr_SetString(PyExc_ValueError,
"make_encoder() argument 4 must be a non-negative int");
return NULL;
}
/* Create a string of spaces: ' ' * indent_level */
indent = PyUnicode_New(indent_level, ' ');
if (indent == NULL) {
return NULL;
}
if (indent_level > 0) {
memset(PyUnicode_1BYTE_DATA(indent), ' ', indent_level);
}
Comment on lines +1345 to +1347
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (indent_level > 0) {
memset(PyUnicode_1BYTE_DATA(indent), ' ', indent_level);
}
memset(PyUnicode_1BYTE_DATA(indent), ' ', sizeof(Py_UCS1) * indent_level);

memset allows for a 0 size (and does nothing) as long as the pointer is valid. Alternatively, we could use PyUnicode_Fill but I think we usually bypass this and use memset directly.

}
else {
Py_INCREF(indent);
}

s = (PyEncoderObject *)type->tp_alloc(type, 0);
if (s == NULL)
if (s == NULL) {
Py_DECREF(indent);
return NULL;
}

s->markers = Py_NewRef(markers);
s->defaultfn = Py_NewRef(defaultfn);
s->encoder = Py_NewRef(encoder);
s->indent = Py_NewRef(indent);
s->indent = indent; /* Already incref'd or newly created */
s->key_separator = Py_NewRef(key_separator);
s->item_separator = Py_NewRef(item_separator);
s->sort_keys = sort_keys;
Expand Down Expand Up @@ -1453,6 +1486,12 @@ encoder_call(PyObject *op, PyObject *args, PyObject *kwds)

PyObject *indent_cache = NULL;
if (self->indent != Py_None) {
if (indent_level != 0) {
PyErr_SetString(PyExc_ValueError,
"_current_indent_level must be 0 when indent is set");
PyUnicodeWriter_Discard(writer);
return NULL;
}
indent_cache = create_indent_cache(self, indent_level);
if (indent_cache == NULL) {
PyUnicodeWriter_Discard(writer);
Expand Down
Loading