Compare commits
8 Commits
e8d2855b36
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 995ddf329f | |||
| 5fc823b392 | |||
| 490b49e55d | |||
| 6fc263074e | |||
| befb464619 | |||
| f2bb72b3dc | |||
| 81814fa590 | |||
| 611bccfb9b |
33
README.md
33
README.md
@@ -1,4 +1,4 @@
|
|||||||
# WeaselJSON: A Streaming JSON Parser Review
|
# WeaselJSON: A Streaming JSON Parser
|
||||||
|
|
||||||
## What is WeaselJSON?
|
## What is WeaselJSON?
|
||||||
|
|
||||||
@@ -17,28 +17,20 @@ WeaselJSON is a high-performance, streaming JSON parser that uses callbacks inst
|
|||||||
- The callback API is a pain to use correctly
|
- The callback API is a pain to use correctly
|
||||||
- Requires a lot of boilerplate for simple tasks
|
- Requires a lot of boilerplate for simple tasks
|
||||||
- Most people don't actually need streaming JSON parsing
|
- Most people don't actually need streaming JSON parsing
|
||||||
- Some features only work if you control how the JSON is structured
|
|
||||||
|
|
||||||
## JSON Parser Decision Guide
|
## JSON Parser Decision Guide
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
flowchart TD
|
flowchart TD
|
||||||
A[Need to parse JSON?] --> B{Do you have memory constraints<br/>or truly streaming data?}
|
A[Need to parse JSON?] --> B{Memory or size constraints?}
|
||||||
|
|
||||||
B -->|No| C{Is performance critical<br/>and data fits in memory?}
|
B -->|Yes| C[WeaselJSON<br/>Streaming parser]
|
||||||
B -->|Yes| D{Can you control<br/>how the JSON is laid out?}
|
|
||||||
|
|
||||||
C -->|No| E[SimdJSON DOM API<br/>Fast and easy to use<br/>Wait why are you using C++?]
|
B -->|No| D{Need maximum speed?}
|
||||||
C -->|Yes| F{Are you OK with potential<br/>performance traps?}
|
|
||||||
|
|
||||||
F -->|Yes| G[SimdJSON On-Demand<br/>Very fast<br/>Nice API<br/>Easy to use wrong]
|
D -->|No| E[SimdJSON DOM<br/>Easy to use]
|
||||||
F -->|No| H[Consider weaseljson<br/>if you can deal with callbacks]
|
|
||||||
|
|
||||||
D -->|No| I{Can you preprocess data<br/>or make multiple requests?}
|
D -->|Yes| F[SimdJSON On-Demand<br/>Fastest option]
|
||||||
D -->|Yes| J[WeaselJSON<br/>Streaming performance<br/>Constant memory usage<br/>Harder to use]
|
|
||||||
|
|
||||||
I -->|No| K[Use SimdJSON DOM<br/>Deal with the tradeoffs]
|
|
||||||
I -->|Yes| L[WeaselJSON with<br/>data preprocessing]
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## When to Use What
|
## When to Use What
|
||||||
@@ -47,18 +39,19 @@ flowchart TD
|
|||||||
- You want to write `obj["key"]` and have it just work
|
- You want to write `obj["key"]` and have it just work
|
||||||
- JSON files are reasonably sized (but you still want decent performance)
|
- JSON files are reasonably sized (but you still want decent performance)
|
||||||
- You care about getting things done
|
- You care about getting things done
|
||||||
|
- You need full document validation upfront
|
||||||
|
|
||||||
### Use **SimdJSON On-Demand** when:
|
### Use **SimdJSON On-Demand** when:
|
||||||
- Performance is critical and your data fits in memory
|
- Performance is critical and your data fits in memory
|
||||||
- You understand that some access patterns can accidentally become very slow
|
- You can work with forward-only traversal (no random access or backtracking)
|
||||||
- You need maximum speed but still want a usable API
|
- You need maximum speed but still want a usable API
|
||||||
|
- You're okay with partial validation (only the parts of the document you actually access are validated)
|
||||||
|
- Your JSON keys don't contain escape sequences (OnDemand matches raw keys without unescaping)
|
||||||
|
|
||||||
### Use **WeaselJSON** when:
|
### Use **WeaselJSON** when:
|
||||||
- You absolutely cannot load the whole JSON into memory
|
|
||||||
- You're processing huge JSON files (multiple gigabytes)
|
|
||||||
- You want to parse just part of a JSON file without reading the rest
|
- You want to parse just part of a JSON file without reading the rest
|
||||||
- You need to convert JSON into some other format as you parse it
|
- You need to convert JSON into some other format as you parse it
|
||||||
- You can't risk accidentally slow performance
|
- You need predictable performance characteristics
|
||||||
- Writing stateful callbacks is your idea of fun
|
- Writing stateful callbacks is your idea of fun
|
||||||
|
|
||||||
## The Reality
|
## The Reality
|
||||||
@@ -67,8 +60,6 @@ WeaselJSON solves real problems that other parsers can't handle, but most people
|
|||||||
|
|
||||||
The parser represents excellent engineering work and occupies a useful niche. It's the kind of tool you're very glad exists when you actually need it, but most people will never need it.
|
The parser represents excellent engineering work and occupies a useful niche. It's the kind of tool you're very glad exists when you actually need it, but most people will never need it.
|
||||||
|
|
||||||
**Note**: WeaselJSON assumes modern CPU features (SIMD), so you probably can't use it for embedded development.
|
|
||||||
|
|
||||||
## Technical Reference
|
## Technical Reference
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
@@ -76,7 +67,7 @@ The parser represents excellent engineering work and occupies a useful niche. It
|
|||||||
- No memory allocations during parsing
|
- No memory allocations during parsing
|
||||||
- O(1) memory usage regardless of input size
|
- O(1) memory usage regardless of input size
|
||||||
- Streaming API - no need to buffer the entire document in memory. Parsing is resumed when more data is available
|
- Streaming API - no need to buffer the entire document in memory. Parsing is resumed when more data is available
|
||||||
- Strings are unescaped in place before they're presented. No unicode normalization is performed
|
- By default, strings are unescaped in place before they're presented (this modifies your input buffer). No Unicode normalization is performed
|
||||||
- Robust against crashes when given untrusted input
|
- Robust against crashes when given untrusted input
|
||||||
- SIMD optimizations for string scanning and validation
|
- SIMD optimizations for string scanning and validation
|
||||||
|
|
||||||
|
|||||||
@@ -214,7 +214,7 @@ inline PRESERVE_NONE WeaselJsonStatus scan_string_impl(Parser3 *self,
|
|||||||
}
|
}
|
||||||
auto v = V{(int8_t *)buf};
|
auto v = V{(int8_t *)buf};
|
||||||
int normal =
|
int normal =
|
||||||
(v != V::splat('"') & v != V::splat('\\') & v >= V::splat(0x20))
|
((v != V::splat('"')) & (v != V::splat('\\')) & (v >= V::splat(0x20)))
|
||||||
.count_leading_nonzero_lanes();
|
.count_leading_nonzero_lanes();
|
||||||
buf += normal;
|
buf += normal;
|
||||||
if (normal < V::lanes) {
|
if (normal < V::lanes) {
|
||||||
|
|||||||
@@ -558,7 +558,7 @@ template <std::integral T, int kLanes> struct simd<T, kLanes, Simd_x86_SSE> {
|
|||||||
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
|
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
|
||||||
__m128i v0;
|
__m128i v0;
|
||||||
memcpy(&v0, &x[i], 16);
|
memcpy(&v0, &x[i], 16);
|
||||||
v0 = _mm_xor_si128(v0, _mm_set1_epi8(0xff));
|
v0 = _mm_xor_si128(v0, _mm_set1_epi8((char)0xff));
|
||||||
memcpy(&result.x[i], &v0, 16);
|
memcpy(&result.x[i], &v0, 16);
|
||||||
}
|
}
|
||||||
for (; i < kLanes; ++i) {
|
for (; i < kLanes; ++i) {
|
||||||
@@ -1702,13 +1702,13 @@ template <std::integral T, int kLanes> struct simd<T, kLanes, Simd_x86_AVX2> {
|
|||||||
for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) {
|
for (; i + 32 / sizeof(T) <= kLanes; i += 32 / sizeof(T)) {
|
||||||
__m256i v0;
|
__m256i v0;
|
||||||
memcpy(&v0, &x[i], 32);
|
memcpy(&v0, &x[i], 32);
|
||||||
v0 = _mm256_xor_si256(v0, _mm256_set1_epi8(0xff));
|
v0 = _mm256_xor_si256(v0, _mm256_set1_epi8((char)0xff));
|
||||||
memcpy(&result.x[i], &v0, 32);
|
memcpy(&result.x[i], &v0, 32);
|
||||||
}
|
}
|
||||||
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
|
for (; i + 16 / sizeof(T) <= kLanes; i += 16 / sizeof(T)) {
|
||||||
__m128i v0;
|
__m128i v0;
|
||||||
memcpy(&v0, &x[i], 16);
|
memcpy(&v0, &x[i], 16);
|
||||||
v0 = _mm_xor_si128(v0, _mm_set1_epi8(0xff));
|
v0 = _mm_xor_si128(v0, _mm_set1_epi8((char)0xff));
|
||||||
memcpy(&result.x[i], &v0, 16);
|
memcpy(&result.x[i], &v0, 16);
|
||||||
}
|
}
|
||||||
for (; i < kLanes; ++i) {
|
for (; i < kLanes; ++i) {
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
|
_GLOBAL_OFFSET_TABLE_
|
||||||
__cpu_indicator_init
|
__cpu_indicator_init
|
||||||
__cpu_model
|
__cpu_model
|
||||||
|
__stack_chk_fail@GLIBC_2.4
|
||||||
free@GLIBC_2.2.5
|
free@GLIBC_2.2.5
|
||||||
malloc@GLIBC_2.2.5
|
malloc@GLIBC_2.2.5
|
||||||
memmove@GLIBC_2.2.5
|
memmove@GLIBC_2.2.5
|
||||||
|
|||||||
Reference in New Issue
Block a user