Source code for biothings.hub.databuild.syncer

import asyncio
import copy
import json
import os
import pickle
import sys
import time
from datetime import datetime
from functools import partial
from pprint import pformat

from elasticsearch.exceptions import ConflictError
from elasticsearch.helpers import BulkIndexError

import biothings.utils.jsonpatch as jsonpatch
from biothings import config as btconfig
from biothings.hub import SYNCER_CATEGORY
from biothings.utils.common import iter_n, loadobj, timesofar
from biothings.utils.hub_db import get_src_build
from biothings.utils.loggers import get_logger
from biothings.utils.manager import BaseManager
from biothings.utils.mongo import doc_feeder, get_target_db, invalidate_cache
from biothings.utils.storage import UpsertStorage

from .backend import create_backend, generate_folder

logging = btconfig.logger



[docs]
class SyncerException(Exception):
    pass




[docs]
class BaseSyncer(object):
    # diff type name, identifying the diff algorithm
    # must be set in sub-class
    diff_type = None

    # backend used to sync data (mongo / es)
    # must be set in sub-class
    target_backend_type = None

    def __init__(self, job_manager, log_folder):
        self.log_folder = log_folder
        self.job_manager = job_manager
        self.timestamp = datetime.now()
        self.ti = time.time()
        self.synced_cols = None  # str representation of synced cols (internal usage)
        self.setup_log()
        # set by manager during instanciation
        self.old = None
        self.new = None
        self.target_backend = None
        self._meta = None


[docs]
    def setup_log(self, build_name=None):
        log_folder = None
        if build_name:
            log_folder = os.path.join(btconfig.LOG_FOLDER, "build", build_name) if btconfig.LOG_FOLDER else None
        self.logger, self.logfile = get_logger("sync", log_folder=log_folder, force=True)



[docs]
    def get_predicates(self):
        # def no_same_syncer_running(job_manager):
        #    """Avoid syncers collision"""
        #    return len([j for j in job_manager.jobs.values() if \
        #            j["source"] == self.synced_cols and j["category"] == SYNCER_CATEGORY]) == 0
        return []



[docs]
    def get_pinfo(self):
        pinfo = {"category": SYNCER_CATEGORY, "step": "", "description": ""}
        preds = self.get_predicates()
        if preds:
            pinfo["__predicates__"] = preds
        return pinfo



[docs]
    def register_status(self, status, transient=False, init=False, **extra):
        src_build = get_src_build()
        job_info = {
            "status": status,
            "step_started_at": datetime.now().astimezone(),
            "logfile": self.logfile,
        }
        # to select correct diff sub-record (1 collection can be diffed with multiple others)
        diff_key = "%s" % self.old.target_name
        # once in diff, select correct sync sub-record (1 diff can be applied to different backend)
        # replace dots as hostname can have dots which could be interpreted as dotted field by mongo
        # also remove doc_type (which can be sometimes None if hub deals with multiple APIs,
        # and is not useful in distinguishing where the diff was applid since there's only one
        # doc type allowed now since ES6 (last element in self.target_backend is doc_type)
        sync_key = "-".join(self.target_backend[:-1]).replace(".", "-")
        sync_info = {sync_key: {}}
        if transient:
            # record some "in-progress" information
            job_info["pid"] = os.getpid()
        else:
            # only register time when it's a final state
            job_info["time"] = timesofar(self.ti)
            t1 = round(time.time() - self.ti, 0)
            job_info["time_in_s"] = t1
            sync_info[sync_key]["created_at"] = datetime.now().astimezone()
        if "sync" in extra:
            sync_info[sync_key].update(extra["sync"])
        if "job" in extra:
            job_info.update(extra["job"])
        # since the base is the merged collection, we register info there
        # as the new collection (diff results are associated to the most recent colleciton)
        build = src_build.find_one({"_id": self.new.target_name})
        if not build:
            self.logger.info("Can't find build document '%s', no status to register" % self.new.target_name)
            return
        assert "diff" in build and diff_key in build["diff"], "Missing previous diff information in build document"
        if init:
            # init timer for this step
            self.ti = time.time()
            src_build.update({"_id": self.new.target_name}, {"$push": {"jobs": job_info}})
            # now refresh/sync
            build = src_build.find_one({"_id": self.new.target_name})
        else:
            # merge extra at root level
            # (to keep building data...) and update the last one
            # (it's been properly created before when init=True)
            build["jobs"] and build["jobs"][-1].update(job_info)

            def merge_info(target, d):
                if "__REPLACE__" in d.keys():
                    d.pop("__REPLACE__")
                    target = d
                else:
                    for k, v in d.items():
                        if isinstance(v, dict):
                            if k in target:
                                target[k] = merge_info(target[k], v)
                            else:
                                v.pop("__REPLACE__", None)
                                # merge v with "nothing" just to make sure to remove any "__REPLACE__"
                                v = merge_info({}, v)
                                target[k] = v
                        else:
                            target[k] = v
                return target

            sync_info = {"sync": merge_info(build["diff"][diff_key].get("sync", {}), sync_info)}
            build["diff"][diff_key].update(sync_info)
            # src_build.update({'_id': build["_id"]}, {"$set": index_info})
            src_build.replace_one({"_id": build["_id"]}, build)



[docs]
    def load_metadata(self, diff_folder):
        self._meta = json.load(open(os.path.join(diff_folder, "metadata.json")))



[docs]
    def get_target_backend(self):
        # first try to use what's been passed explicitely
        # then default to what's in config (tuple will be used for create_backend() call)
        # or use what we have in the diff metadata
        old_db_col_names = (
            self.target_backend
            or (btconfig.ES_HOST, btconfig.ES_INDEX_NAME, btconfig.ES_DOC_TYPE)
            or self._meta["old"]["backend"]
        )
        return old_db_col_names



[docs]
    async def sync_cols(
        self,
        diff_folder,
        batch_size=10000,
        mode=None,
        force=False,
        target_backend=None,
        steps=("mapping", "content", "meta", "post"),
        debug=False,
    ):
        """
        Sync a collection with diff files located in diff_folder. This folder contains a metadata.json file which
        describes the different involved collection: "old" is the collection/index to be synced, "new" is the collecion
        that should be obtained once all diff files are applied (not used, just informative).
        If target_backend (bt.databbuild.backend.create_backend() notation), then it will replace "old" (that is, the one
        being synced)
        """
        if isinstance(steps, tuple):
            steps = list(steps)  # may not be necessary, but previous steps default is a list, so let's be consistent
        elif isinstance(steps, str):
            steps = [steps]
        assert self.old and self.new, "'self.old' and 'self.new' must be set to old/new collections"
        self.target_backend = target_backend
        got_error = False
        cnt = 0
        jobs = []
        self.load_metadata(diff_folder)
        meta = self._meta
        diff_type = self.diff_type
        selfcontained = "selfcontained" in self._meta["diff"]["type"]
        old_db_col_names = self.get_target_backend()
        new_db_col_names = self._meta["new"]["backend"]

        self.setup_log(new_db_col_names)

        diff_mapping_file = self._meta["diff"]["mapping_file"]
        pinfo = self.get_pinfo()
        self.synced_cols = "%s -> %s" % (old_db_col_names, new_db_col_names)
        pinfo["source"] = self.synced_cols
        summary = {}
        if "mapping" in steps and self.target_backend_type == "es":
            if diff_mapping_file:
                # old_db_col_names is actually the index name in that case
                index_name = old_db_col_names[1]
                doc_type = self._meta["build_config"]["doc_type"]
                indexer = create_backend(old_db_col_names).target_esidxer
                pinfo["step"] = "mapping"
                pinfo["description"] = diff_mapping_file

                def update_mapping():
                    diffm = os.path.join(diff_folder, diff_mapping_file["name"])
                    ops = loadobj(diffm)
                    mapping = indexer.get_mapping()
                    # we should have the same doc type declared in the mapping
                    mapping[doc_type]["properties"] = jsonpatch.apply_patch(mapping[doc_type]["properties"], ops)
                    res = indexer.update_mapping(mapping)
                    return res

                self.register_status("syncing", transient=True, init=True, job={"step": "sync-mapping"})
                job = await self.job_manager.defer_to_thread(pinfo, partial(update_mapping))

                def updated(f):
                    try:
                        _ = f.result()
                        self.logger.info("Mapping updated on index '%s'" % index_name)
                        summary["mapping_updated"] = True
                        self.register_status("success", job={"step": "sync-mapping"}, sync=summary)
                    except Exception as e:
                        nonlocal got_error
                        self.logger.error("Failed to update mapping on index '%s': %s" % (index_name, e))
                        self.register_status("failed", job={"err": repr(e)})
                        got_error = e

                job.add_done_callback(updated)
                await job

            if got_error:
                self.logger.error(
                    "Failed to update mapping on index '%s': %s" % (old_db_col_names, got_error), extra={"notify": True}
                )
                raise got_error

        if "content" in steps:
            if selfcontained:
                # selfconained is a worker param, isolate diff format
                diff_type = diff_type.replace("-selfcontained", "").replace("-", "_")
            diff_files = [
                (os.path.join(diff_folder, e["name"]), e.get("worker_args", {})) for e in self._meta["diff"]["files"]
            ]
            total = len(diff_files)
            self.logger.info(
                "Syncing %s to %s using diff files in '%s'" % (old_db_col_names, new_db_col_names, diff_folder)
            )
            pinfo["step"] = "content"
            self.register_status("syncing", transient=True, init=True, job={"step": "sync-content"})
            for diff_file, worker_args in diff_files:
                cnt += 1
                pinfo["description"] = "file %s (%s/%s)" % (diff_file, cnt, total)
                worker = getattr(
                    sys.modules["biothings.hub.databuild.syncer"],
                    "sync_%s_%s_worker" % (self.target_backend_type, diff_type),
                )
                strwargs = worker_args and " using specific worker args %s" % repr(worker_args) or ""
                self.logger.info(
                    "Creating sync worker %s for file %s (%s/%s)%s" % (worker.__name__, diff_file, cnt, total, strwargs)
                )
                # deepcopy to make we don't embed "self" with unpickleable stuff
                meta = copy.deepcopy(self._meta)
                job = await self.job_manager.defer_to_process(
                    pinfo,
                    partial(
                        worker,
                        diff_file,
                        old_db_col_names,
                        new_db_col_names,
                        worker_args.get("batch_size") or batch_size,
                        cnt,
                        force,
                        selfcontained,
                        meta,
                        debug,
                    ),
                )
                jobs.append(job)

            def synced(f):
                try:
                    res = f.result()
                    for d in res:
                        for k in d:
                            summary.setdefault(k, 0)
                            summary[k] += d[k]
                except Exception as e:
                    nonlocal got_error
                    got_error = e
                    self.register_status("failed", job={"err": repr(e)})
                    raise

            tasks = asyncio.gather(*jobs)
            tasks.add_done_callback(synced)
            await tasks
            if got_error:
                self.logger.error(
                    "Failed to sync collection from %s to %s using diff files in '%s': %s"
                    % (old_db_col_names, new_db_col_names, diff_folder, got_error),
                    extra={"notify": True},
                )
                raise got_error
            self.register_status("success", job={"step": "sync-content"}, sync=summary)

        if "meta" in steps and self.target_backend_type == "es":
            # old_db_col_names is actually the index name in that case
            index_name = old_db_col_names[1]
            doc_type = self._meta["build_config"]["doc_type"]
            indexer = create_backend(old_db_col_names).target_esidxer
            new_meta = self._meta["_meta"]
            pinfo["step"] = "metadata"

            def update_metadata():
                res = indexer.update_mapping_meta({"_meta": new_meta})
                return res

            job = await self.job_manager.defer_to_thread(pinfo, partial(update_metadata))

            def updated(f):
                try:
                    res = f.result()
                    self.logger.info("Metadata updated on index '%s': %s", index_name, res)
                    summary["metadata_updated"] = True
                    self.register_status("success", job={"step": "sync-meta"}, sync=summary)
                except Exception as e:
                    nonlocal got_error
                    self.logger.error("Failed to update metadata on index '%s': %s", index_name, e)
                    self.register_status("failed", job={"err": repr(e)})
                    got_error = e

            self.register_status("syncing", transient=True, init=True, job={"step": "sync-meta"})
            job.add_done_callback(updated)
            await job

            if got_error:
                self.logger.error(
                    "Failed to update metadata on index '%s': %s" % (old_db_col_names, got_error),
                    extra={"notify": True},
                )
                raise got_error

        if "post" in steps:
            pinfo["step"] = "post"
            job = await self.job_manager.defer_to_thread(
                pinfo,
                partial(
                    self.post_sync_cols,
                    diff_folder=diff_folder,
                    batch_size=batch_size,
                    mode=mode,
                    force=force,
                    target_backend=target_backend,
                    steps=steps,
                ),
            )

            def posted(f):
                try:
                    res = f.result()
                    self.logger.info("Post-sync process done on index '%s': %s", repr(old_db_col_names), res)
                    summary["post-sync"] = True
                    self.register_status("success", job={"step": "sync-post"}, sync=summary)
                except Exception as e:
                    nonlocal got_error
                    self.logger.error("Failed to run post-sync process on index '%s': %s", repr(old_db_col_names), e)
                    self.register_status("failed", job={"err": repr(e)})
                    got_error = e

            self.register_status("syncing", transient=True, init=True, job={"step": "sync-post"})
            job.add_done_callback(posted)
            await job

            if got_error:
                self.logger.error(
                    "Failed to run post-sync process on index '%s': %s",
                    repr(old_db_col_names),
                    got_error,
                    extra={"notify": True},
                )
                raise got_error

        self.logger.info(
            "Succesfully synced index %s to reach collection %s using diff files in '%s': %s",
            old_db_col_names,
            new_db_col_names,
            diff_folder,
            summary,
            extra={"notify": True},
        )

        return summary



[docs]
    def post_sync_cols(self, diff_folder, batch_size, mode, force, target_backend, steps):
        """Post-sync hook, can be implemented in sub-class"""
        return



[docs]
    def sync(
        self,
        diff_folder=None,
        batch_size=10000,
        mode=None,
        target_backend=None,
        steps=("mapping", "content", "meta", "post"),
        debug=False,
    ):
        """wrapper over sync_cols() coroutine, return a task"""
        job = asyncio.ensure_future(
            self.sync_cols(
                diff_folder=diff_folder,
                batch_size=batch_size,
                mode=mode,
                target_backend=target_backend,
                steps=steps,
                debug=debug,
            )
        )
        return job





[docs]
class ThrottlerSyncer(BaseSyncer):
    def __init__(self, max_sync_workers, *args, **kwargs):
        super(ThrottlerSyncer, self).__init__(*args, **kwargs)
        self.max_sync_workers = max_sync_workers


[docs]
    def get_predicates(self):
        preds = super(ThrottlerSyncer, self).get_predicates()
        if preds is None:
            preds = []

        def not_too_much_syncers(job_manager):
            """
            Limit number of syncers accordingly (this is useful when live-updating
            the prod,we usually need to reduce the number of sync workers as they
            would kill the ES server otherwise... (or at least produces timeout errors)
            """
            return (
                len([j for j in job_manager.jobs.values() if j["category"] == SYNCER_CATEGORY]) < self.max_sync_workers
            )

        preds.append(not_too_much_syncers)
        return preds





[docs]
class MongoJsonDiffSyncer(BaseSyncer):
    diff_type = "jsondiff"
    target_backend_type = "mongo"




[docs]
class MongoJsonDiffSelfContainedSyncer(BaseSyncer):
    diff_type = "jsondiff-selfcontained"
    target_backend_type = "mongo"




[docs]
class ESJsonDiffSyncer(BaseSyncer):
    diff_type = "jsondiff"
    target_backend_type = "es"




[docs]
class ESJsonDiffSelfContainedSyncer(BaseSyncer):
    diff_type = "jsondiff-selfcontained"
    target_backend_type = "es"




[docs]
class ESColdHotJsonDiffSyncer(BaseSyncer):
    diff_type = "coldhot-jsondiff"
    target_backend_type = "es"




[docs]
class ESColdHotJsonDiffSelfContainedSyncer(BaseSyncer):
    diff_type = "coldhot-jsondiff-selfcontained"
    target_backend_type = "es"




[docs]
class ThrottledESJsonDiffSyncer(ThrottlerSyncer, ESJsonDiffSyncer):
    pass




[docs]
class ThrottledESJsonDiffSelfContainedSyncer(ThrottlerSyncer, ESJsonDiffSelfContainedSyncer):
    pass




[docs]
class ThrottledESColdHotJsonDiffSyncer(ThrottlerSyncer, ESColdHotJsonDiffSyncer):
    pass




[docs]
class ThrottledESColdHotJsonDiffSelfContainedSyncer(ThrottlerSyncer, ESColdHotJsonDiffSelfContainedSyncer):
    pass



# TODO: refactor workers (see sync_es_...)

[docs]
def sync_mongo_jsondiff_worker(
    diff_file,
    old_db_col_names,
    new_db_col_names,
    batch_size,
    cnt,
    force=False,
    selfcontained=False,
    metadata=None,
    debug=False,
):
    """Worker to sync data between a new and an old mongo collection"""
    metadata = metadata or {}
    # check if diff files was already synced
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it", os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    new = create_backend(new_db_col_names)
    old = create_backend(old_db_col_names)
    storage = UpsertStorage(get_target_db(), old.target_collection.name, logging)
    diff = loadobj(diff_file)
    assert new.target_collection.name == diff["source"], "Source is different in diff file '%s': %s" % (
        diff_file,
        diff["source"],
    )

    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, not mongo needed
        for docs in iter_n(diff["add"], batch_size):
            res["added"] += storage.process((d for d in docs), batch_size)
    else:
        cur = doc_feeder(new.target_collection, step=batch_size, inbatch=False, query={"_id": {"$in": diff["add"]}})
        for docs in iter_n(cur, batch_size):
            # use generator otherwise process/doc_iterator will require a dict (that's bad...)
            res["added"] += storage.process((d for d in docs), batch_size)

    # update: get doc from "old" and apply diff
    batch = []
    for patch_info in diff["update"]:
        doc = old.get_from_id(patch_info["_id"])
        try:
            doc = jsonpatch.apply_patch(doc, patch_info["patch"])
            batch.append(doc)
        except jsonpatch.JsonPatchConflict:
            # assuming already applieda
            res["skipped"] += 1
            continue
        if len(batch) >= batch_size:
            res["updated"] += storage.process((d for d in batch), batch_size)
            batch = []
    if batch:
        res["updated"] += storage.process((d for d in batch), batch_size)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        res["deleted"] += old.remove_from_ids(ids)

    # we potentially modified the "old" collection so invalidate cache just to make sure
    invalidate_cache(old.target_collection.name, "target")
    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res




[docs]
def sync_es_jsondiff_worker(
    diff_file,
    es_config,
    new_db_col_names,
    batch_size,
    cnt,
    force=False,
    selfcontained=False,
    metadata=None,
    debug=False,
):
    """Worker to sync data between a new mongo collection and an elasticsearch index"""
    metadata = metadata or {}
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff files was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it", os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)", es_config, eskwargs)
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)
    errors = []
    # add: get ids from "new"
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], "Source is different in diff file '%s': %s" % (
            diff_file,
            diff["source"],
        )
        cur = doc_feeder(
            new.target_collection,
            step=batch_size,
            inbatch=False,
            query={"_id": {"$in": diff["add"]}},
        )
    for docs in iter_n(cur, batch_size):
        # remove potenial existing _timestamp from document
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        try:
            res["added"] += indexer.index_bulk(docs, batch_size, action="create")[0]
        except BulkIndexError:
            for doc in docs:
                _id = doc.pop("_id")
                try:
                    # force action=create to spot docs already added
                    indexer.index(doc, _id, action="create")
                    res["added"] += 1
                except ConflictError:
                    # already added
                    logging.warning("_id '%s' already added" % _id)
                    res["skipped"] += 1
                    continue
                except Exception as e:
                    errors.append({"_id": _id, "file": diff_file, "error": e})
                    import pickle

                    pickle.dump(errors, open("errors", "wb"))
                    raise
        except Exception as e:
            if debug:
                logging.error(
                    "From diff file '%s', following IDs couldn't be synced because: %s\n%s",
                    diff_file,
                    e,
                    [d.get("_id") for d in docs],
                )
                pickfile = "batch_%s_%s.pickle" % (cnt, os.path.basename(diff_file))
                logging.error("Documents pickled in '%s'" % pickfile)
                pickle.dump(docs, open(pickfile, "wb"))
            raise

    # update: get doc from indexer and apply diff
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res, debug)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        # FIXME: bulk delete can fail
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res




[docs]
def sync_es_coldhot_jsondiff_worker(
    diff_file,
    es_config,
    new_db_col_names,
    batch_size,
    cnt,
    force=False,
    selfcontained=False,
    metadata=None,
    debug=False,
):
    metadata = metadata or {}
    res = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
    # check if diff files was already synced
    synced_file = "%s.synced" % diff_file
    if os.path.exists(synced_file):
        logging.info("Diff file '%s' already synced, skip it", os.path.basename(diff_file))
        diff = loadobj(synced_file)
        res["skipped"] += len(diff["add"]) + len(diff["delete"]) + len(diff["update"])
        return res
    eskwargs = {}
    # pass optional ES Indexer args
    if hasattr(btconfig, "ES_TIMEOUT"):
        eskwargs["timeout"] = btconfig.ES_TIMEOUT
    if hasattr(btconfig, "ES_MAX_RETRY"):
        eskwargs["max_retries"] = btconfig.ES_MAX_RETRY
    if hasattr(btconfig, "ES_RETRY"):
        eskwargs["retry_on_timeout"] = btconfig.ES_RETRY
    logging.debug("Create ES backend with args: (%s,%s)", es_config, eskwargs)
    bckend = create_backend(es_config, **eskwargs)
    indexer = bckend.target_esidxer
    diff = loadobj(diff_file)

    # add: diff between hot collections showed we have new documents but it's
    # possible some of those docs already exist in premerge/cold collection.
    # if so, they should be treated as dict.update() where the hot document content
    # has precedence over the cold content for fields in common
    if selfcontained:
        # diff["add"] contains all documents, no mongo needed
        cur = diff["add"]
    else:
        new = create_backend(new_db_col_names)  # mongo collection to sync from
        assert new.target_collection.name == diff["source"], "Source is different in diff file '%s': %s" % (
            diff_file,
            diff["source"],
        )
        cur = doc_feeder(
            new.target_collection,
            step=batch_size,
            inbatch=False,
            query={"_id": {"$in": diff["add"]}},
        )
    for docs in iter_n(cur, batch_size):
        # remove potenial existing _timestamp from document
        # (not allowed within an ES document (_source))
        [d.pop("_timestamp", None) for d in docs]
        # check which docs already exist in existing index (meaning they exist in cold collection)
        dids = dict([(d["_id"], d) for d in docs])
        dexistings = dict([(d["_id"], d) for d in indexer.get_docs([k for k in dids.keys()])])
        logging.debug("From current batch, %d already exist" % len(dexistings))
        # remove existing docs from "add" so the rest of the dict will be treated
        # as "real" added documents while update existing ones with new content
        toremove = []
        for _id, d in dexistings.items():
            # update in-place
            if d == dids[d["_id"]]:
                logging.debug("%s was already added, skip it" % d["_id"])
                toremove.append(d["_id"])
                res["skipped"] += 1
            else:
                newd = copy.deepcopy(d)
                d.update(dids[d["_id"]])
                if d == newd:
                    logging.debug("%s was already updated, skip it" % d["_id"])
                    toremove.append(d["_id"])
                    res["skipped"] += 1
            dids.pop(d["_id"])
        for _id in toremove:
            dexistings.pop(_id)
        logging.info(
            "Syncing 'add' documents (%s in total) from cold/hot merge: " % len(docs)
            + "%d documents will be updated as they already exist in the index, " % len(dexistings)
            + "%d documents will be added (%d skipped as already processed)" % (len(dids), len(toremove))
        )
        # treat real "added" documents
        # Note: no need to check for "already exists" errors, as we already checked that before
        # in order to know what to do
        try:
            res["added"] += indexer.index_bulk(dids.values(), batch_size, action="create")[0]
        except BulkIndexError:
            logging.error("Error while adding documents %s" % [k for k in dids.keys()])
        # update already existing docs in cold collection
        # treat real "added" documents
        try:
            res["updated"] += indexer.index_bulk(dexistings.values(), batch_size)[0]
        except BulkIndexError as e:
            logging.error("Error while updating (via new hot detected docs) documents: %s" % e)

    # update: get doc from indexer and apply diff
    # note: it's the same process as for non-coldhot
    sync_es_for_update(diff_file, indexer, diff["update"], batch_size, res, debug)

    # delete: remove from "old"
    for ids in iter_n(diff["delete"], batch_size):
        del_skip = indexer.delete_docs(ids)
        res["deleted"] += del_skip[0]
        res["skipped"] += del_skip[1]

    logging.info("Done applying diff from file '%s': %s" % (diff_file, res))
    # mark as synced
    os.rename(diff_file, synced_file)
    return res




[docs]
def sync_es_for_update(diff_file, indexer, diffupdates, batch_size, res, debug):
    batch = []
    ids = [p["_id"] for p in diffupdates]
    iterids_bcnt = iter_n(ids, batch_size, True)
    for batchids, bcnt in iterids_bcnt:
        try:
            for i, doc in enumerate(indexer.get_docs(batchids)):
                # recompute correct index in diff["update"], since we split it in batches
                diffidx = i + bcnt - len(batchids)  # len(batchids) is not == batch_size for the last one...
                try:
                    patch_info = diffupdates[diffidx]  # same order as what's return by get_doc()...
                    assert patch_info["_id"] == doc["_id"], "%s != %s" % (
                        patch_info["_id"],
                        doc["_id"],
                    )  # ... but just make sure
                    newdoc = jsonpatch.apply_patch(doc, patch_info["patch"])
                    if newdoc == doc:
                        # already applied
                        logging.warning("_id '%s' already synced" % doc["_id"])
                        res["skipped"] += 1
                        continue
                    batch.append(newdoc)
                except jsonpatch.JsonPatchConflict as e:
                    # assuming already applied
                    logging.warning("_id '%s' already synced ? JsonPatchError: %s", doc["_id"], e)
                    res["skipped"] += 1
                    continue
                if len(batch) >= batch_size:
                    res["updated"] += indexer.index_bulk(batch, batch_size)[0]
                    batch = []
            if batch:
                res["updated"] += indexer.index_bulk(batch, batch_size)[0]
        except Exception as e:
            if debug:
                logging.error(
                    "From diff file '%s', %d IDs couldn't be synced because: %s\n%s", diff_file, e, len(batchids)
                )
                pickfile = "batch_sync_updater_%s_%s.pickle" % (bcnt, os.path.basename(diff_file))
                logging.error("IDs pickled in '%s'" % pickfile)
                pickle.dump(batchids, open(pickfile, "wb"))
            raise




[docs]
class SyncerManager(BaseManager):
    def __init__(self, *args, **kwargs):
        """
        SyncerManager deals with the different syncer objects used to synchronize
        different collections or indices using diff files
        """
        super(SyncerManager, self).__init__(*args, **kwargs)
        self.setup_log()


[docs]
    def clean_stale_status(self):
        src_build = get_src_build()
        for build in src_build.find():
            dirty = False
            for job in build.get("jobs", []):
                if job.get("status") == "syncing":
                    logging.warning("Found stale build '%s', marking sync status as 'canceled'", build["_id"])
                    job["status"] = "canceled"
                    dirty = True
            if dirty:
                src_build.replace_one({"_id": build["_id"]}, build)



[docs]
    def register_syncer(self, klass):
        if isinstance(klass, partial):
            assert isinstance(klass.func, type), "%s is not a class" % klass.func
            diff_type, target_backend_type = klass.func.diff_type, klass.func.target_backend_type
        else:
            diff_type, target_backend_type = klass.diff_type, klass.target_backend_type

        self.register[(diff_type, target_backend_type)] = partial(
            klass, log_folder=btconfig.LOG_FOLDER, job_manager=self.job_manager
        )



[docs]
    def configure(self, klasses=None):
        """
        Register default syncers (if klasses is None) or given klasses.
        klasses is a list of class, or a list of partial'ly initialized classes.
        """
        klasses = klasses or [
            MongoJsonDiffSyncer,
            ESJsonDiffSyncer,
            MongoJsonDiffSelfContainedSyncer,
            ESJsonDiffSelfContainedSyncer,
        ]
        for klass in klasses:
            self.register_syncer(klass)



[docs]
    def setup_log(self):
        self.logger, self.logfile = get_logger("syncmanager")


    def __getitem__(self, diff_target):
        """
        Return an instance of a builder for the build named 'build_name'
        Note: each call returns a different instance (factory call behind the scene...)
        """
        # we'll get a partial class but will return an instance
        pclass = BaseManager.__getitem__(self, diff_target)
        return pclass()


[docs]
    def sync(
        self,
        backend_type,
        old_db_col_names,
        new_db_col_names,
        diff_folder=None,
        batch_size=10000,
        mode=None,
        target_backend=None,
        steps=("mapping", "content", "meta", "post"),
        debug=False,
    ):
        if isinstance(steps, tuple):
            steps = list(steps)  # may not be necessary, but previous steps default is a list, so let's be consistent
        elif isinstance(steps, str):
            steps = [steps]
        if hasattr(btconfig, "SYNC_BATCH_SIZE"):
            batch_size = btconfig.SYNC_BATCH_SIZE
            self.logger.debug("Overriding sync batch_size default to %s" % batch_size)
        if diff_folder is None:
            diff_folder = generate_folder(btconfig.DIFF_PATH, old_db_col_names, new_db_col_names)
        if not os.path.exists(diff_folder):
            raise FileNotFoundError(f"Directory '{diff_folder}' does not exist, run a diff first")
        # load metadata to know collections that have been diffed in diff_func protocol
        try:
            meta = json.load(open(os.path.join(diff_folder, "metadata.json")))
        except FileNotFoundError:
            self.logger.error("Can't find metadata file in diff folder '%s'", diff_folder)
            raise

        self.logger.info("Found metadata information: %s" % pformat(meta))
        try:
            diff_type = meta["diff"]["type"]
        except KeyError:
            msg = "Can't find diff_type in metadata file located in '%s'" % diff_folder
            raise SyncerException(msg)

        try:
            syncer = self[(diff_type, backend_type)]
            self.logger.info("Selected syncer: %s" % syncer)
            syncer.old = create_backend(old_db_col_names)
            syncer.new = create_backend(new_db_col_names)
            job = syncer.sync(
                diff_folder,
                batch_size=batch_size,
                mode=mode,
                target_backend=target_backend,
                steps=steps,
                debug=debug,
            )
            return job
        except KeyError as e:
            raise SyncerException("No such syncer (%s,%s) (error: %s)" % (diff_type, backend_type, e))