Add item data
Add item data
To index items that store their data outside the source, you can create a pre-push extension.
For example, you might be using a Database source, where the database contains item metadata, including a path to a file that stores the item data.
Use a pre-push extension to open the file, extract its content, and add it to the item body.
The following example shows how.
The script creates an Extensions folder under the COVEO_LOGS_ROOT folder (if it doesn’t already exist) and a subfolder named after the source ID.
The script logs relevant information about each crawled item in a .log file in that folder.
Leading practice
Apply the extension to a duplicate of your production source with a name that clearly indicates it’s for testing purposes only. In this test source, crawl only a small subset of content for faster debugging and to limit the log file size. Only after fully testing and validating the pre-push extension in the test source should you apply it to your production source.
After you apply this extension, rebuild the source.
# Import required Python libraries. Note: Add any library that is not part of the Python standard library to the requirements.txt file.
import os
import base64
import zlib
import logging
from logging.handlers import TimedRotatingFileHandler
# Set up rotating file logging for this extension run.
#
# Log files are written to <COVEO_LOGS_ROOT>/Extensions/<SOURCE_ID>/ and named
# <OPERATION_TYPE>_<OPERATION_ID>.log. Missing SOURCE_ID / OPERATION_TYPE /
# OPERATION_ID values fall back to "unknown".
# NOTE(review): COVEO_LOGS_ROOT has no fallback; if it is unset, os.getenv
# returns None and os.path.join raises TypeError.
logs_root = os.getenv("COVEO_LOGS_ROOT")
source_id = os.getenv("SOURCE_ID", "unknown")
log_folder = os.path.join(logs_root, "Extensions", source_id)
os.makedirs(log_folder, exist_ok=True)  # tolerate a folder that already exists

operation_type = os.getenv("OPERATION_TYPE", "unknown")
operation_id = os.getenv("OPERATION_ID", "unknown")
fname = f"{operation_type}_{operation_id}.log"
fpath = os.path.join(log_folder, fname)

# Roll the log file over at midnight; rotated files carry a date suffix.
handler = TimedRotatingFileHandler(fpath, when="midnight")
handler.suffix = "%Y-%m-%d"

# Timestamp (with milliseconds), level, then the message.
formatter = logging.Formatter(
    fmt="%(asctime)s.%(msecs)03d %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
handler.setFormatter(formatter)

# Route everything logged through the root logger (INFO and above) to the file.
logging.basicConfig(level=logging.INFO, handlers=[handler])
# -----------------------------------------------------------------
# Extension entry point. The do_extension function must be defined.
# -----------------------------------------------------------------
def do_extension(body, *, full_path="C:/Data/sample.pdf"):
    """Attach the content of an external file to the item body.

    The file at ``full_path`` is read as binary, compressed with zlib,
    base64-encoded, and stored in ``body["CompressedBinaryData"]``, with
    ``body["CompressionType"]`` set to ``"ZLIB"``.

    Any failure (file missing, unreadable, empty, or a compression error) is
    logged and leaves ``body`` without those two keys, so a problem with one
    item never raises out of the extension.

    Args:
        body: The item body, a dict-like object. ``DocumentId`` is read for
            logging only.
        full_path: Path of the file holding the item data. Keyword-only;
            defaults to the sample path used by this example.

    Returns:
        The same ``body`` object, modified in place when the file content
        was attached.
    """
    document_id = body.get("DocumentId", "<missing>")
    logging.info("BEGIN processing item: %s", document_id)

    _attach_file_content(body, full_path)

    # Logged on every path, including read failures, so that each BEGIN line
    # in the log has a matching END line.
    logging.info("END processing document: %s", document_id)
    return body


def _attach_file_content(body, full_path):
    """Compress the file at ``full_path`` into ``body``; log and skip on failure."""
    # File existence check (also rejects directories).
    if not os.path.isfile(full_path):
        logging.warning("File not found: %s", full_path)
        return
    logging.info("File found: %s", full_path)

    # Broad catch is deliberate: a file that cannot be read must not abort
    # the processing of the item.
    try:
        with open(full_path, "rb") as f:
            file_data = f.read()
    except Exception as ex:
        # logging.exception keeps the original message and adds the traceback.
        logging.exception("Failed to read file '%s': %s", full_path, ex)
        return

    # Empty file check
    if not file_data:
        logging.warning("file_data is empty for document: %s", full_path)
        return
    logging.info("Read %d bytes from %s", len(file_data), full_path)

    try:
        # Compress first, then base64-encode so the binary payload can be
        # carried as text in the item body.
        compressed = zlib.compress(file_data)
        encoded = base64.b64encode(compressed).decode("ascii")
        body["CompressionType"] = "ZLIB"
        body["CompressedBinaryData"] = encoded
    except Exception as ex:
        logging.exception(
            "Error during compression/encoding for '%s': %s", full_path, ex
        )
    else:
        logging.info(
            "Compressed and encoded file. Original size: %d bytes, Compressed size: %d bytes",
            len(file_data),
            len(compressed),
        )