import sys
import json
import argparse
from pathlib import Path
from perplexity import Perplexity
# Shared API client used by every request helper below.
# NOTE(review): presumably picks up API credentials from the environment — confirm.
client = Perplexity()
MAX_CONTENT_CHARS = 50000  # Truncate very large files to stay within token limits


def read_document(file_path: str) -> str:
    """Read a document file and return its (possibly truncated) text content.

    Args:
        file_path: Path to the document on disk.

    Returns:
        The file's text. Content longer than MAX_CONTENT_CHARS is cut off
        and an explicit "[... truncated ...]" marker is appended.
    """
    path = Path(file_path)
    # Decode as UTF-8 explicitly so results don't depend on the platform's
    # locale encoding; errors="replace" keeps going on undecodable bytes.
    content = path.read_text(encoding="utf-8", errors="replace")
    if len(content) > MAX_CONTENT_CHARS:
        content = content[:MAX_CONTENT_CHARS] + "\n\n[... truncated ...]"
    return content
def ask_about_document(
    file_path: str,
    question: str,
    use_web_search: bool = False,
    conversation_history: list = None,
) -> dict:
    """Ask a question about a document's content.

    Args:
        file_path: Path to the document to read and embed in the prompt.
        question: The user's question about the document.
        use_web_search: When True, expose the web_search tool to the model.
        conversation_history: Optional prior role/content message dicts for
            multi-turn sessions; the document + question turn is appended.

    Returns:
        dict with keys "answer" (model output text), "model" (model name),
        and "tokens" (input/output counts; 0 when the API reports no usage).
    """
    doc_content = read_document(file_path)
    filename = Path(file_path).name

    # Frame the document content between rulers, followed by the question.
    # BUG FIX: the header previously hard-coded "(unknown)" even though the
    # filename was computed just above — include the actual filename.
    full_input = (
        f"Document: {filename}\n"
        f"{'='*60}\n"
        f"{doc_content}\n"
        f"{'='*60}\n\n"
        f"Question: {question}"
    )

    # Multi-turn: send prior messages plus this turn; otherwise a plain
    # string input. (The two create() calls were previously duplicated
    # verbatim — collapsed into a single call.)
    api_input = (
        conversation_history + [{"role": "user", "content": full_input}]
        if conversation_history
        else full_input
    )
    response = client.responses.create(
        model="openai/gpt-5.4",
        input=api_input,
        tools=[{"type": "web_search"}] if use_web_search else [],
        instructions="Answer questions based on the provided document content. Be specific and cite sections when possible.",
    )

    usage = response.usage
    return {
        "answer": response.output_text,
        "model": response.model,
        "tokens": {
            "input": usage.input_tokens if usage else 0,
            "output": usage.output_tokens if usage else 0,
        },
    }
def extract_structured_data(file_path: str, schema_name: str, schema: dict) -> dict:
    """Extract structured data from a document using a JSON schema.

    Args:
        file_path: Path to the document to read.
        schema_name: Name reported to the API for the JSON schema.
        schema: JSON-schema dict describing the desired output structure.

    Returns:
        The extracted data, parsed from the model's JSON output.
    """
    prompt = (
        "Extract the requested structured data from this document:\n\n"
        + read_document(file_path)
    )
    structured_format = {
        "type": "json_schema",
        "json_schema": {"name": schema_name, "schema": schema},
    }
    response = client.responses.create(
        model="openai/gpt-5.4",
        input=prompt,
        response_format=structured_format,
    )
    return json.loads(response.output_text)
def interactive_session(file_path: str, use_web_search: bool = False):
    """Run an interactive Q&A session over a document.

    Args:
        file_path: Path to the document being queried.
        use_web_search: When True, enable the web_search tool each turn.
    """
    print(f"Document loaded: {file_path}")
    print(f"Web search: {'enabled' if use_web_search else 'disabled'}")
    print("Type 'quit' to exit.\n")

    history = []
    while True:
        # ROBUSTNESS FIX: Ctrl-D (EOFError) or Ctrl-C (KeyboardInterrupt)
        # at the prompt previously crashed with a traceback — treat both
        # as a request to exit the session cleanly.
        try:
            question = input("Question: ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            break
        if question.lower() in ("quit", "exit", "q"):
            break
        if not question:
            continue
        result = ask_about_document(file_path, question, use_web_search, history)
        print(f"\nAnswer:\n{result['answer']}\n")
        print(f"({result['tokens']['input']}+{result['tokens']['output']} tokens)\n")
        # Grow the history so later turns can reference earlier answers.
        history.append({"role": "user", "content": question})
        history.append({"role": "assistant", "content": result["answer"]})
def main():
    """CLI entry point: answer a single question or start interactive mode."""
    parser = argparse.ArgumentParser(description="Document Q&A")
    parser.add_argument("file", help="Path to the document file")
    parser.add_argument("question", nargs="?", help="Question to ask")
    parser.add_argument("--interactive", action="store_true", help="Interactive mode")
    parser.add_argument("--web-search", action="store_true", help="Enable web search")
    args = parser.parse_args()

    # Fail fast on a missing file before any API calls are attempted.
    if not Path(args.file).exists():
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    # Guard-clause dispatch: interactive mode, one-shot question, or usage error.
    if args.interactive:
        interactive_session(args.file, args.web_search)
        return
    if args.question:
        result = ask_about_document(args.file, args.question, args.web_search)
        print(result["answer"])
        return
    print("Error: Provide a question or use --interactive.", file=sys.stderr)
    sys.exit(1)


if __name__ == "__main__":
    main()