Process Super Big JSON File

Sometimes we need to process a huge JSON file. If it is a multi-line JSON file, things are easy: you can simply iterate over all the lines, it just takes some time. But if the file is one giant object, with an array nested somewhere deep inside, and the whole JSON sits on a single line, things get much harder.

Main Idea

A JSON file is usually big because some array contains a very large number of items, or some dictionary contains a very large number of keys (1, 2, 3, ...). So all we need to do is split those items out into small files, and write the remaining data into a separate file.
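To make the idea concrete, here is the transformation we are after, sketched in plain Python on a toy document. This sketch loads everything into memory and is only an illustration; the real scripts below do the same thing while streaming the input with ijson.

doc = {"id": 1, "records": [{"key": i} for i in range(1, 101)], "name": "alice"}

chunk_size = 10
records = doc.pop("records")  # take the big array out of the document
chunks = [
    # each chunk becomes one small output file: arrays/1.json, arrays/2.json, ...
    records[i : i + chunk_size]
    for i in range(0, len(records), chunk_size)
]
remainder = doc  # what is left goes to data.json: {"id": 1, "name": "alice"}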

Tools

ijson is a C-backed Python library that parses JSON by reading the byte stream sequentially, which saves a great deal of memory. With ijson we can locate the array node that holds the many items and split that data out. In other words, we spend extra time to save memory.
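To give a feel for the ijson API before reading the full scripts, here is a minimal sketch of the two calls they rely on: ijson.items() streams the elements of an array identified by a dotted prefix, and ijson.kvitems() streams the (key, value) pairs of the object at a prefix. The toy document here is just an in-memory bytes buffer.

import io

import ijson

doc = b'{"id": 1, "data": {"date": "2000-01-01", "records": [{"k": 1}, {"k": 2}, {"k": 3}]}}'

# "data.records.item" means: each element of the array at data.records
for record in ijson.items(io.BytesIO(doc), "data.records.item"):
    print(record)  # {'k': 1}, then {'k': 2}, then {'k': 3}

# stream the (key, value) pairs of the object at "data"; this is what the
# delete_node function below uses to copy everything except one key
for key, value in ijson.kvitems(io.BytesIO(doc), "data"):
    print(key)  # 'date', then 'records'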

Below are two scripts: one processes a local file, the other processes a file on S3.

# -*- coding: utf-8 -*-

"""
This is a solution that can split a large JSON file into smaller chunks (if
there is a big array node) without using much memory.

Use case 1:

Input::

    {
        "id": 1,
        "records": [
            {"key": 1},
            {"key": 2},
            {"key": 3},
            ...
        ],
        "name": "alice"
    }

Output::

    # data.json
    {
        "id": 1,
        "name": "alice"
    }

    # arrays/1.json
    [
        {"key": 1},
        {"key": 2},
        ...
        {"key": 10}
    ]

    # arrays/2.json
    [
        {"key": 11},
        {"key": 12},
        ...
        {"key": 20}
    ]

    # arrays/10.json
    [
        {"key": 91},
        {"key": 92},
        ...
        {"key": 100}
    ]

Use case 2:

Input::

    {
        "id": 1,
        "data": {
            "date": "2000-01-01",
            "records": [
                {"key": 1},
                {"key": 2},
                {"key": 3},
                ...
            ],
        },
        "name": "alice"
    }

Output::

    # data.json
    {
        "id": 1,
        "data": {
            "date": "2000-01-01",
        },
        "name": "alice"
    }

    # arrays/1.json
    [
        {"key": 1},
        {"key": 2},
        ...
        {"key": 10}
    ]

    # arrays/2.json
    [
        {"key": 11},
        {"key": 12},
        ...
        {"key": 20}
    ]

    # arrays/10.json
    [
        {"key": 91},
        {"key": 92},
        ...
        {"key": 100}
    ]

Benchmark result:

- Input file: 1G, split into 10 files
- MacBook Pro: 32 GB memory + SSD
- Time: 12s
"""

import typing as T
import json
import ijson
import shutil
import itertools
from pathlib import Path
from datetime import datetime

dir_here = Path(__file__).parent


def make_data(path: Path):
    """
    Generate a test JSON file that has a big nested array node.
    """
    # n_records = 10
    # str_length = 10

    n_records = 1000
    str_length = 1000000

    data = {
        "id": 1,
        "data": {
            "date": "2000-01-01",
            "records": [
                {"k": i, "v": "a" * str_length} for i in range(1, 1 + n_records)
            ],
        },
        "name": "alice",
    }

    # data = {
    #     "id": 1,
    #     "records": [
    #         {"k": i, "v": "a" * str_length}
    #         for i in range(1, 1 + n_records)
    #     ],
    #     "name": "alice",
    # }

    with path.open("w") as f:
        json.dump(data, f)


def delete_node(
    p_in: Path,
    json_path: str,
) -> dict:
    """
    Read the JSON file, drop the node at the given json path, and return the
    remaining data. The file is re-read once per level of the json path, so
    memory usage stays low at the cost of extra IO.

    Example::

        # example 1
        >>> input_data = {
        ...     "id": 1,
        ...     "delete": [],
        ... }
        >>> json_path = "delete"
        >>> print(output_data)
        {
            "id": 1
        }

        # example 2
        >>> input_data = {
        ...     "id": 1,
        ...     "a": {
        ...         "delete": []
        ...     },
        ... }
        >>> json_path = "a.delete"
        >>> print(output_data)
        {
            "id": 1,
            "a": {}
        }

        # example 3
        >>> input_data = {
        ...     "id": 1,
        ...     "a": {
        ...         "a_value": 2,
        ...         "b": {
        ...             "delete": [],
        ...             "b_value": 3,
        ...         },
        ...     },
        ... }
        >>> json_path = "a.b.delete"
        >>> print(output_data)
        {
            "id": 1,
            "a": {
                "a_value": 2,
                "b": {
                    "b_value": 3
                }
            }
        }
    """
    # build (prefix, key) pairs for every level of the json path, for example
    # "a.b.delete" -> [("", "a"), ("a", "b"), ("a.b", "delete")]
    parts = json_path.split(".")
    prefix_and_key_pairs = []
    lst = list()
    for part in parts:
        prefix = ".".join(lst)
        key = part
        prefix_and_key_pairs.append((prefix, key))
        lst.append(part)

    new_data = dict()
    parent_data = new_data
    for prefix, key in prefix_and_key_pairs:
        # print(f"------ prefix = {prefix}, key = {key} ------")
        data = dict()
        with p_in.open("r") as f_in:
            for k, v in ijson.kvitems(f_in, prefix):
                if k != key:
                    data[k] = v
        if prefix == "":
            new_data = data
        else:
            parent_data[prefix.split(".")[-1]] = data
        parent_data = data
        # print("new_data:", new_data)

    return new_data


def take(n: int, iterable: T.Iterable):
    """
    Return first n items of the iterable as a list.
    """
    return list(itertools.islice(iterable, n))


def split_json(
    p_in: Path,
    dir_out: Path,
    json_path: str,
    chunk_size: int,
):
    """
    :param p_in: input data path
    :param dir_out: output data directory, it should not exist
    :param json_path: the json path in dot notation to the array you want to split
    :param chunk_size: group items in the array into chunks of this size
    """
    if dir_out.exists():
        raise FileExistsError(f"{dir_out} already exists")

    path_data = dir_out.joinpath("data.json")
    dir_arrays = dir_out.joinpath("arrays")
    dir_arrays.mkdir(parents=True)

    # split the big json array into many small json arrays (at most 999 chunk files)
    with p_in.open("r") as f_in:
        iterator = ijson.items(f_in, f"{json_path}.item")
        for ith in range(1, 1 + 999):
            items = take(chunk_size, iterator)
            path_out = dir_arrays.joinpath(f"{ith}.json")
            if len(items) == 0:
                break
            else:
                with path_out.open("w") as f_out:
                    json.dump(items, f_out)

    # delete the big json array node from the original json
    data = delete_node(p_in=p_in, json_path=json_path)
    with path_data.open("w") as f_out:
        json.dump(data, f_out)


if __name__ == "__main__":
    dir_output = dir_here / "output"
    path_data = dir_here / "data.json"

    def test_delete_node():
        path = dir_here.joinpath("test_delete_node.json")
        input_output_jsonpath = [
            (
                {
                    "id": 1,
                    "delete": [],
                },
                {
                    "id": 1,
                },
                "delete",
            ),
            (
                {
                    "id": 1,
                    "a": {"delete": []},
                },
                {
                    "id": 1,
                    "a": {},
                },
                "a.delete",
            ),
            (
                {
                    "id": 1,
                    "a": {
                        "a_value": 2,
                        "b": {
                            "delete": [],
                            "b_value": 3,
                        },
                    },
                },
                {
                    "id": 1,
                    "a": {
                        "a_value": 2,
                        "b": {
                            "b_value": 3,
                        },
                    },
                },
                "a.b.delete",
            ),
        ]
        for input_data, output_data, jsonpath in input_output_jsonpath:
            path.write_text(json.dumps(input_data))
            result = delete_node(path, json_path=jsonpath)
            assert result == output_data

    def test_split_json():
        shutil.rmtree(dir_output, ignore_errors=True)
        st = datetime.utcnow()
        split_json(
            p_in=path_data,
            dir_out=dir_output,
            json_path="data.records",
            chunk_size=120,
        )
        et = datetime.utcnow()
        elapse = (et - st).total_seconds()
        print(f"elapsed time: {elapse:.2f} seconds")

    # test_delete_node()
    # make_data(path_data)
    # test_split_json()
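If you ever need the original document back, the output of split_json can be merged again. Below is a hedged sketch of such a merge helper; the function name merge_json is my own and is not part of the script above. Note that it loads everything back into memory, so it is mainly useful for verifying the split on data that still fits in memory.

import json
from pathlib import Path


def merge_json(dir_out: Path, json_path: str) -> dict:
    # read the remainder document
    data = json.loads(dir_out.joinpath("data.json").read_text())
    # read the array chunks in numeric order: arrays/1.json, arrays/2.json, ...
    items = []
    for p in sorted(dir_out.joinpath("arrays").glob("*.json"), key=lambda p: int(p.stem)):
        items.extend(json.loads(p.read_text()))
    # put the array back at the json path, e.g. "data.records"
    parts = json_path.split(".")
    node = data
    for part in parts[:-1]:
        node = node.setdefault(part, {})
    node[parts[-1]] = items
    return data

The second script below applies the same idea to a file stored on S3, packaged as an AWS Lambda function.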
# -*- coding: utf-8 -*-

"""
This is a solution that can split a large JSON file into smaller chunks (if
there is a big array node) without using much memory.

Requirements::

    ijson

**Example 1**

- Input file: 1G, split into 10 files
- Memory: 10G

if only split:

- Duration: 25s
- Max Memory Used: 540 MB

if also delete the node:

- Duration: 50s
- Max Memory Used: 1037 MB

**Example 2**

- Input file: 1G, split into 10 files
- Memory: 2G

if only split:

- Duration: 25s
- Max Memory Used: 540 MB

if also delete the node:

- Duration: 50s
- Max Memory Used: 1037 MB
"""

import typing as T
import json
import ijson
import dataclasses
import itertools

import boto3


s3_client = boto3.client("s3")


def split_s3_uri(uri: str) -> T.Tuple[str, str]:
    # "s3://bucket/path/to/key" -> ("bucket", "path/to/key")
    parts = uri.split("/", 3)
    bucket = parts[2]
    key = parts[3]
    return bucket, key


def get_object(uri: str):
    bucket, key = split_s3_uri(uri)
    return s3_client.get_object(
        Bucket=bucket,
        Key=key,
    )


def put_object(uri: str, body):
    bucket, key = split_s3_uri(uri)
    return s3_client.put_object(
        Bucket=bucket,
        Key=key,
        Body=body,
    )


def delete_node(
    s3uri: str,
    json_path: str,
) -> dict:
    """
    Read the JSON file from S3, delete the node at the given json path, and
    return the json data with the node deleted.

    Example::

        # example 1
        >>> input_data = {
        ...     "id": 1,
        ...     "delete": [],
        ... }
        >>> json_path = "delete"
        >>> print(output_data)
        {
            "id": 1
        }

        # example 2
        >>> input_data = {
        ...     "id": 1,
        ...     "a": {
        ...         "delete": []
        ...     },
        ... }
        >>> json_path = "a.delete"
        >>> print(output_data)
        {
            "id": 1,
            "a": {}
        }

        # example 3
        >>> input_data = {
        ...     "id": 1,
        ...     "a": {
        ...         "a_value": 2,
        ...         "b": {
        ...             "delete": [],
        ...             "b_value": 3,
        ...         },
        ...     },
        ... }
        >>> json_path = "a.b.delete"
        >>> print(output_data)
        {
            "id": 1,
            "a": {
                "a_value": 2,
                "b": {
                    "b_value": 3
                }
            }
        }
    """
    # build (prefix, key) pairs for every level of the json path, for example
    # "a.b.delete" -> [("", "a"), ("a", "b"), ("a.b", "delete")]
    parts = json_path.split(".")
    prefix_and_key_pairs = []
    lst = list()
    for part in parts:
        prefix = ".".join(lst)
        key = part
        prefix_and_key_pairs.append((prefix, key))
        lst.append(part)

    new_data = dict()
    parent_data = new_data
    for prefix, key in prefix_and_key_pairs:
        # print(f"------ prefix = {prefix}, key = {key} ------")
        data = dict()
        with get_object(s3uri)["Body"] as f_in:
            for k, v in ijson.kvitems(f_in, prefix):
                if k != key:
                    data[k] = v
        if prefix == "":
            new_data = data
        else:
            parent_data[prefix.split(".")[-1]] = data
        parent_data = data
        # print("new_data:", new_data)

    return new_data


def take(n: int, iterable: T.Iterable):
    """
    Return first n items of the iterable as a list.
    """
    return list(itertools.islice(iterable, n))


def split_json(
    s3file_input: str,
    s3dir_output: str,
    json_path: str,
    chunk_size: int,
):
    """
    :param s3file_input: the s3 uri of the input JSON file
    :param s3dir_output: the s3 uri of the output directory (ending with "/"),
        it is supposed to be empty
    :param json_path: the json path in dot notation to the array you want to split
    :param chunk_size: group items in the array into chunks of this size
    """
    json_path = json_path.strip(".")

    s3file_data = f"{s3dir_output}data.json"
    s3dir_arrays = f"{s3dir_output}arrays/"

    # split the big json array into many small files (at most 999 chunk files);
    # unlike the local version, each chunk is written as JSON Lines
    # (one JSON object per line) rather than a JSON array
    with get_object(s3file_input)["Body"] as f_in:
        iterator = ijson.items(f_in, f"{json_path}.item")
        for ith in range(1, 1 + 999):
            items = take(chunk_size, iterator)
            s3path_output = f"{s3dir_arrays}{ith}.json"
            if len(items) == 0:
                break
            else:
                put_object(
                    s3path_output,
                    "\n".join([
                        json.dumps(item)
                        for item in items
                    ])
                )

    # delete the big json array node from the original json
    data = delete_node(s3uri=s3file_input, json_path=json_path)
    put_object(s3file_data, json.dumps(data))


@dataclasses.dataclass
class Request:
    """
    Lambda request event

    :param s3file_input: the s3 uri of the input JSON file
    :param s3dir_output: the s3 uri of the output directory (ending with "/"),
        it is supposed to be empty
    :param json_path: the json path in dot notation to the array you want to split
    :param chunk_size: group items in the array into chunks of this size
    """
    s3file_input: str
    s3dir_output: str
    json_path: str
    chunk_size: int


def lambda_handler(event, context):
    """
    Example event::

        {
            "s3file_input": "s3://807388292768-us-east-1-data/tmp/data.json",
            "s3dir_output": "s3://807388292768-us-east-1-data/tmp/output/",
            "json_path": "data.records",
            "chunk_size": 120
        }
    """
    request = Request(**event)
    split_json(
        s3file_input=request.s3file_input,
        s3dir_output=request.s3dir_output,
        json_path=request.json_path,
        chunk_size=request.chunk_size,
    )
    return {"statusCode": 200}
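Finally, a hedged sketch of how the Lambda function might be invoked from another Python script. The function name "split-json" and the bucket in the payload are made-up placeholders; only the event shape matches the lambda_handler above.

import json

import boto3

lambda_client = boto3.client("lambda")

response = lambda_client.invoke(
    FunctionName="split-json",  # hypothetical function name
    InvocationType="RequestResponse",
    Payload=json.dumps({
        "s3file_input": "s3://my-bucket/tmp/data.json",
        "s3dir_output": "s3://my-bucket/tmp/output/",
        "json_path": "data.records",
        "chunk_size": 120,
    }),
)
print(json.loads(response["Payload"].read()))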