Modify item metadata

Use a pre-push extension to add, update, or remove metadata for crawled items before indexing.

The script creates an Extensions folder under the COVEO_LOGS_ROOT folder and, inside it, a subfolder named after the source ID (existing folders are reused). The script then logs relevant information about each crawled item to a .log file in that subfolder. The log file is named after the operation type and operation ID.

Tip
Leading practice

Apply the extension to a duplicate of your production source with a name that clearly indicates it’s for testing purposes only. In this test source, crawl only a small subset of content for faster debugging and to limit the log file size.

Only after fully testing and validating the pre-push extension in the test source should you apply it to your production source.

After you apply this extension, rebuild the source.

# Import required Python libraries. Note: Add any libraries that aren't part of the Python standard library to the requirements.txt file.
import os
import logging
from logging.handlers import TimedRotatingFileHandler
import json

# Initialize a rotating log handler.
# Logs are written to <COVEO_LOGS_ROOT>/Extensions/<SOURCE_ID>/. Like the other
# environment lookups below, COVEO_LOGS_ROOT has a fallback: when it's unset or
# empty (for example, during a local test run), the current working directory
# is used instead of failing with a TypeError in os.path.join.
log_folder = os.path.join(
    os.getenv('COVEO_LOGS_ROOT') or os.getcwd(),
    "Extensions",
    os.getenv("SOURCE_ID", "unknown")
)
os.makedirs(log_folder, exist_ok=True)

# One log file per operation: <OPERATION_TYPE>_<OPERATION_ID>.log
fname = f"{os.getenv('OPERATION_TYPE','unknown')}_{os.getenv('OPERATION_ID','unknown')}.log"
fpath = os.path.join(log_folder, fname)

# Roll the file over at midnight; rotated files get a date suffix.
# The encoding is set explicitly so that non-ASCII item IDs and metadata values
# are written the same way on every platform instead of depending on the
# host's default encoding.
handler = TimedRotatingFileHandler(fpath, when="midnight", encoding="utf-8")
handler.suffix = "%Y-%m-%d"

# Timestamp with millisecond precision, followed by the level and the message
formatter = logging.Formatter(
    fmt="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)
handler.setFormatter(formatter)

# Route INFO and higher messages from the root logger to the rotating file.
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO, handlers=[handler])

# -----------------------------------------------------------------
# Extension entry point. The do_extension function must be defined.
# -----------------------------------------------------------------
def do_extension(body):
    """Add, update, and remove metadata on a crawled item.

    Sets the ``mymetadatakey`` entry to a value derived from the item's
    ``DocumentId`` and removes the ``mymetadatakeytoremove`` entry if it's
    present. Each step is logged at INFO level.

    Args:
        body: Dict-like item metadata. It's modified in place. If it has no
            ``DocumentId`` entry, ``"<missing>"`` is used in its place.

    Returns:
        The same ``body`` object, after modification.
    """

    # Log basic item info
    document_id = body.get("DocumentId", "<missing>")
    logging.info("BEGIN processing item: %s", document_id)

    # Dynamic metadata logic
    metadata_key = "mymetadatakey"
    # Apply your logic to compute the metadata value
    metadata_value = f"value-for-{document_id}"

    # Add or update a piece of metadata
    body[metadata_key] = metadata_value
    logging.info("Set metadata: %s = %s", metadata_key, metadata_value)

    # Remove a piece of metadata
    removed_key = "mymetadatakeytoremove"
    if removed_key in body:
        # Remove first, then log, so the log entry reflects a completed action.
        del body[removed_key]
        logging.info("Removed metadata: %s", removed_key)
    else:
        logging.info("Metadata to remove not present: %s", removed_key)

    # Use the same wording ("item") as the BEGIN message so that the two
    # entries can be matched when reading or searching the log.
    logging.info("END processing item: %s", document_id)

    return body