Code development platform for open source projects from the European Union institutions

Skip to content
Snippets Groups Projects
Commit 54ede653 authored by nasredine's avatar nasredine
Browse files

preliminary results of prompt engineering

parents
No related branches found
No related tags found
No related merge requests found
.env
myenv/*
data/*
akn4eu_generation_open_llms.ipynb
import requests
import json
import time
import os
def call_api(input_text, token, model_name, max_new_tokens):
    """Send an inference request to the DeepInfra API and time it.

    Args:
        input_text: Prompt text sent as the model input.
        token: DeepInfra API bearer token.
        model_name: Model identifier, e.g. "mistralai/Mixtral-8x7B-Instruct-v0.1".
        max_new_tokens: Maximum number of tokens the model may generate.

    Returns:
        Tuple of (parsed JSON response, wall-clock execution time in seconds).
    """
    start_time = time.time()  # Capture start time
    url = f"https://api.deepinfra.com/v1/inference/{model_name}"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {token}",
    }
    data = {
        "input": input_text,
        "max_new_tokens": max_new_tokens,
    }
    # json= serializes the payload for us (the original hand-rolled
    # json.dumps); a timeout keeps the script from hanging forever on a
    # dead connection, which the original could do.
    response = requests.post(url, headers=headers, json=data, timeout=300)
    execution_time = time.time() - start_time  # Elapsed wall-clock seconds
    return response.json(), execution_time
def save_result_to_file(model_name, result, execution_time, base_path='data/responses'):
    """Persist an API result plus its timing as JSON.

    Writes to <base_path>/<model name with '/' replaced by '_'>.json.
    The original wrote into the current working directory, but
    extract_and_save_xml reads from 'data/responses' — so saved results
    were never found.  Defaulting base_path to 'data/responses' fixes the
    round trip; callers may still override it.

    Args:
        model_name: Model identifier; slashes are replaced to keep the
            filename filesystem-safe.
        result: Parsed JSON response from the API.
        execution_time: Wall-clock seconds the request took.
        base_path: Directory to write into (created if missing).
    """
    safe_model_name = model_name.replace("/", "_")
    os.makedirs(base_path, exist_ok=True)
    output_path = os.path.join(base_path, f'{safe_model_name}.json')
    result_with_time = {
        "result": result,
        "execution_time": execution_time,
    }
    with open(output_path, 'w') as file:
        json.dump(result_with_time, file)
def extract_and_save_xml(model_name, base_path='data/responses', output_base_path='data/xml'):
    """Extract the generated XML text from a saved API response and write it out.

    Reads <base_path>/<safe model name>.json, pulls the text out of
    result["results"][0]["generated_text"], and writes it to
    <output_base_path>/<safe model name>.xml.

    Args:
        model_name: Model identifier; slashes are replaced with underscores
            to match the file naming convention used when saving.
        base_path: Directory holding the saved JSON responses.
        output_base_path: Directory the XML files are written to.

    Returns:
        The extracted XML text on success, None if the input file is missing
        or malformed.  (The original always returned None, which made the
        caller's `if xml_result:` check dead code.)
    """
    safe_model_name = model_name.replace("/", "_")
    input_filename = os.path.join(base_path, f'{safe_model_name}.json')
    output_filename = os.path.join(output_base_path, f'{safe_model_name}.xml')
    try:
        with open(input_filename, 'r') as file:
            content = json.load(file)
        generated_text = content["result"]["results"][0]["generated_text"]
        # Ensure the output directory exists before writing.
        os.makedirs(output_base_path, exist_ok=True)
        with open(output_filename, 'w') as file:
            file.write(generated_text)
        print(f"XML saved to {output_filename}")
        return generated_text
    except FileNotFoundError:
        print(f"File not found: {input_filename}")
    except KeyError:
        print(f"Invalid content format in {input_filename}")
    return None
#######################
# Example usage

# Read the prompt text; the context manager guarantees the handle is closed
# (the original open(...).read() leaked it).
with open('/home/nasredine/dev/work/ai4xml/playground/data/prompts/text_xml_translation.prompt', 'r') as prompt_file:
    input_text = prompt_file.read()

# SECURITY: an API token was previously hard-coded on this line and committed
# to version control — it must be treated as leaked and rotated.  Read it
# from the environment instead (same token used for all models for simplicity).
token = os.environ.get("DEEPINFRA_API_TOKEN", "")

# List of models with their corresponding max_new_tokens values
models = [
    {"name": "mistralai/Mixtral-8x7B-Instruct-v0.1", "max_new_tokens": 30000},
    # {"name": "mistralai/Mistral-7B-Instruct-v0.1", "max_new_tokens": 30000},
    # {"name": "meta-llama/Llama-2-7b-chat-hf", "max_new_tokens": 2000},
    # {"name": "meta-llama/Llama-2-70b-chat-hf", "max_new_tokens": 2000},
    # {"name": "codellama/CodeLlama-34b-Instruct-hf", "max_new_tokens": 10000},
    # {"name": "bigcode/starcoder", "max_new_tokens": 30000},
    # {"name": "Salesforce/codegen-16B-mono", "max_new_tokens": 10000},
]

# Query every model and persist its response plus timing.
for model in models:
    model_name = model["name"]
    max_new_tokens = model["max_new_tokens"]
    result, execution_time = call_api(input_text, token, model_name, max_new_tokens)
    # Save the result and execution time to a file named after the model
    save_result_to_file(model_name, result, execution_time)
    print(f"Results and execution time for {model_name} saved.")

# Extract the XML portion of each saved response.
for model in models:
    model_name = model["name"]
    xml_result = extract_and_save_xml(model_name)
    if xml_result:
        print(f"XML result for {model_name}:")
        print(xml_result)
        # Here you can further process the XML result or save it as needed
    else:
        print(f"No XML result available for {model_name}")
This diff is collapsed.
This diff is collapsed.
%% Cell type:code id: tags:
``` python
!pip install langchain
!pip install python-dotenv
!pip install langchain-fireworks
```
%% Cell type:code id: tags:
``` python
import os
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser
from langchain_fireworks import Fireworks
load_dotenv()
```
%% Output
True
%% Cell type:code id: tags:
``` python
api_key = os.getenv("FIREWORKS_API_KEY")
```
%% Cell type:code id: tags:
``` python
MAX_TOKENS = 4000
```
%% Cell type:code id: tags:
``` python
models = [
'accounts/fireworks/models/starcoder-7b',
'accounts/fireworks/models/starcoder-16b',
'accounts/fireworks/models/llama-v2-13b-code-instruct',
'accounts/fireworks/models/llama-v2-34b-code-instruct',
'accounts/fireworks/models/llama-v2-70b-code-instruct',
]
```
%% Cell type:code id: tags:
``` python
shots = [0, 1]  # Add more shot numbers as needed

# Run every model against every shot's prompt and save each raw completion.
for shot in shots:
    base_path = f'data/prompts/{shot}-shot'
    # Context manager closes the prompt file (open(...).read() leaked the handle).
    with open(f'{base_path}/prompt.txt', 'r') as prompt_file:
        prompt = prompt_file.read()
    for model in models:
        model_name = model.split('/')[-1]
        print(f'Processing model: {model_name}')
        llm = Fireworks(
            fireworks_api_key=api_key,
            model=model,
            max_tokens=MAX_TOKENS)
        result = llm.invoke(prompt)
        # Create the results directory if needed — the duplicate of this
        # loop in a later cell does this explicitly, so stay consistent
        # (writing into a missing directory would raise FileNotFoundError).
        results_dir = f'{base_path}/results'
        os.makedirs(results_dir, exist_ok=True)
        file_path = f'{results_dir}/{model_name}.fireworks.ai.txt'
        with open(file_path, 'w') as file:
            file.write(result)
```
%% Output
Processing model: starcoder-7b
Processing model: starcoder-16b
Processing model: llama-v2-13b-code-instruct
Processing model: llama-v2-34b-code-instruct
Processing model: llama-v2-70b-code-instruct
%% Cell type:markdown id: tags:
### Install necessary packages
Langchain supports many LLM inference providers, including Fireworks.
%% Cell type:code id: tags:
``` python
!pip install langchain
!pip install python-dotenv
```
%% Cell type:code id: tags:
``` python
import os
from dotenv import load_dotenv
import requests
import json
load_dotenv()
```
%% Output
True
%% Cell type:markdown id: tags:
### API KEY
* Register and get an API key from: https://fireworks.ai/api-keys
* Put the key in the `.env` file as the FIREWORKS_API_KEY variable
%% Cell type:code id: tags:
``` python
api_key = os.getenv("FIREWORKS_API_KEY")
```
%% Cell type:code id: tags:
``` python
# Maximum number of tokens the model may generate, keyed by shot count.
max_tokens = {0: 1000, 1: 1000, 2: 2000}
```
%% Cell type:markdown id: tags:
### Prompting Models
%% Cell type:code id: tags:
``` python
models = [
'accounts/fireworks/models/starcoder-7b',
'accounts/fireworks/models/starcoder-16b',
'accounts/fireworks/models/llama-v2-13b-code-instruct',
'accounts/fireworks/models/llama-v2-34b-code-instruct',
'accounts/fireworks/models/llama-v2-70b-code-instruct',
'accounts/fireworks/models/mixtral-8x7b-instruct',
]
```
%% Cell type:code id: tags:
``` python
def send_fireworks_chat_request(model, messages, api_key, max_tokens=4096,
                                temperature=0.6, top_p=1, top_k=40,
                                frequency_penalty=0,
                                presence_penalty=0,
                                ):
    """POST a request to the Fireworks.ai chat-completions endpoint.

    Args:
        model: Fully-qualified Fireworks model id
            (e.g. 'accounts/fireworks/models/mixtral-8x7b-instruct').
        messages: Chat history as a list of {"role": ..., "content": ...} dicts.
        api_key: Fireworks API key, sent as a bearer token.
        max_tokens, temperature, top_p, top_k, frequency_penalty,
        presence_penalty: Sampling parameters forwarded verbatim.

    Returns:
        The parsed JSON response on success.  Non-200 responses are wrapped
        as {"error": <body>} so callers can detect failure — previously the
        raw error body was returned bare and the notebook crashed with
        KeyError: 'choices' (visible in the cell output below).
    """
    url = "https://api.fireworks.ai/inference/v1/chat/completions"
    payload = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
        "top_k": top_k,
    }
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    # A timeout keeps the notebook from hanging on a dead connection.
    response = requests.post(url, json=payload, headers=headers, timeout=300)
    if response.status_code != 200:
        # Mirror send_fireworks_request's error contract.
        return {"error": response.json()}
    return response.json()
```
%% Cell type:code id: tags:
``` python
# One-shot chat prompt for plain-text -> AKN4EU XML coverpage conversion:
# a system instruction, one worked example (user -> assistant), and the
# actual request.
# NOTE(review): "formation" in the system prompt is likely a typo for
# "format" — fix in the prompt text itself if re-running experiments.
system_message = """
You are AKN4EU XML formation converter.
You receive plain text documents and convert them to XML.
Return only the final XML document converted from the text.
The user gives some examples of XML documents.
"""
# NOTE(review): this first assistant_message is immediately overwritten by
# the XML version below and is never used — it duplicates user_message_1's
# text and looks like a leftover from an earlier draft.
assistant_message = """
EUROPEAN COMMISSION
Brussels, 21.12.2016
2016/0411 (COD)
Proposal for a
REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE
COUNCIL
amending Regulation (EC)
No 1008/2008 on common rules for the operation of air services in the
Community
EN
"""
# Worked example, part 1: the plain-text coverpage the "user" supplies.
user_message_1 = """
Convert plain text of following coverpage to AKN4EU XML format.
EUROPEAN COMMISSION
Brussels, 21.12.2016
2016/0411 (COD)
Proposal for a
REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE
COUNCIL
amending Regulation (EC)
No 1008/2008 on common rules for the operation of air services in the
Community
EN
"""
# Worked example, part 2: the expected AKN4EU XML answer for user_message_1.
assistant_message = """
<coverPage>
<container name="logo">
<p><img src="EC.png" alt="EUROPEAN COMMISSION"/></p>
</container>
<container name="actingEntity">
<p><organization refersTo="~_COM">EUROPEAN COMMISSION</organization></p>
</container>
<container name="mainDoc">
<block name="placeAndDate">
<location refersTo="~_BEL_BRU">Brussels</location>, <date date="2016-12-21">21.12.2016</date>
</block>
</container>
<container name="procedureIdentifier">
<p><docketNumber refersTo="~_procedure_2016_411">2016/0411 (COD)</docketNumber></p>
</container>
<longTitle>
<p><docStage>Proposal for a</docStage>
<docType refersTo="~_REG">REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE
COUNCIL</docType>
<docPurpose>amending <ref href="http://data.europa.eu/eli/reg/2008/1008">Regulation (EC)
No 1008/2008 on common rules for the operation of air services in the
Community</ref></docPurpose></p>
</longTitle>
<container name="mainDocLanguage">
<p><inline name="language" refersTo="~_FRBRlanguage">EN</inline></p>
</container>
</coverPage>
"""
# The actual conversion request the model should answer.
user_message_2 = """
Convert plain text of following coverpage to AKN4EU XML format.
EUROPEAN COMMISSION
Brussels, 21.12.2017
2012/0412 (COD)
Proposal for a
REGULATION OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL amending Regulation (EC)
No 1009/2009 on common rules for the operation of air services in the Community
EN
"""
# Assembled chat history passed to the chat-completions API.
messages = [
{"role": "system", "content": system_message},
{"role": "user", "content": user_message_1},
{"role": "assistant", "content": assistant_message},
{"role": "user", "content": user_message_2},
]
```
%% Cell type:code id: tags:
``` python
response
```
%% Output
{'error': {'object': 'error',
'type': 'invalid_request_error',
'message': 'model is missing conversation_config, please use /completions API instead'}}
%% Cell type:code id: tags:
``` python
response = send_fireworks_chat_request("accounts/fireworks/models/starcoder-7b",messages, api_key)
text = response['choices'][0]['text']
print(text)
```
%% Output
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Cell In[52], line 2
1 response = send_fireworks_chat_request("accounts/fireworks/models/starcoder-7b",messages, api_key)
----> 2 text = response['choices'][0]['text']
4 print(text)
KeyError: 'choices'
%% Cell type:code id: tags:
``` python
shots = [0, 1, 2]

# Run every model against every shot's prompt, skipping results that
# already exist so the loop is resumable after interruption.
for shot in shots:
    print(f'Processing shot: {shot}')
    base_path = f'data/prompts/{shot}-shot'
    # Context manager closes the prompt file (open(...).read() leaked the handle).
    with open(f'{base_path}/prompt.txt', 'r') as prompt_file:
        prompt = prompt_file.read()
    for model in models:
        model_name = model.split('/')[-1]
        print(f'Processing model: {model_name}')
        results_dir = f'{base_path}/results'
        os.makedirs(results_dir, exist_ok=True)
        file_path = f'{results_dir}/{model_name}.fireworks.ai.txt'
        # Skip models that already have a saved result.
        if os.path.exists(file_path):
            print('Skipping...')
            continue
        llm = Fireworks(
            fireworks_api_key=api_key,
            model=model,
            max_tokens=max_tokens[shot])
        result = llm.invoke(prompt)
        with open(file_path, 'w') as file:
            file.write(result)
```
%% Cell type:markdown id: tags:
### XML Extraction from results
%% Cell type:code id: tags:
``` python
import os
import re  # NOTE(review): unused in this cell — kept in case other cells rely on it

base_dir = 'data/prompts'
shots = [0, 1, 2]

# Pull the <coverPage>...</coverPage> span out of each raw completion and
# save it as .xml alongside the .txt results.
for shot in shots:
    results_path = os.path.join(base_dir, f'{shot}-shot', 'results')
    results_xml_path = os.path.join(base_dir, f'{shot}-shot', 'results-xml')
    # Ensure the results-xml directory exists
    os.makedirs(results_xml_path, exist_ok=True)
    if os.path.isdir(results_path):
        for result_file in os.listdir(results_path):
            file_path = os.path.join(results_path, result_file)
            if not file_path.endswith('.fireworks.ai.txt'):
                continue
            with open(file_path, 'r') as file:
                result_content = file.read()
            start_tag = "<coverPage>"
            end_tag = "</coverPage>"
            # Use the LAST occurrence of each tag: few-shot prompts echo
            # example coverpages, and the model's answer comes last.
            start = result_content.rfind(start_tag)
            end_pos = result_content.rfind(end_tag)
            # BUG FIX: the original computed `end = rfind(...) + len(end_tag)`
            # before checking for -1, so a missing end tag was never detected,
            # and even when a tag was missing it fell through and wrote a
            # bogus slice.  Check first, and skip the file on failure.
            if start == -1 or end_pos == -1:
                print(f"No XML content found in {result_file}")
                continue
            xml_content = result_content[start:end_pos + len(end_tag)]
            # Prepare the filename and path for the extracted XML.
            xml_file_name = result_file.replace('.txt', '.xml')
            xml_file_path = os.path.join(results_xml_path, xml_file_name)
            with open(xml_file_path, 'w') as xml_file:
                xml_file.write(xml_content)
            print(f'Extracted and saved XML for {xml_file_name}')
```
This diff is collapsed.
%% Cell type:markdown id: tags:
### Install necessary packages
Langchain supports many LLM inference providers, including Fireworks.
%% Cell type:code id: tags:
``` python
!pip install langchain
!pip install python-dotenv
```
%% Cell type:code id: tags:
``` python
import os
from dotenv import load_dotenv
import requests
import json
load_dotenv()
```
%% Output
True
%% Cell type:markdown id: tags:
### API KEY
* Register and get an API key from: https://fireworks.ai/api-keys
* Put the key in the `.env` file as the FIREWORKS_API_KEY variable
%% Cell type:code id: tags:
``` python
api_key = os.getenv("FIREWORKS_API_KEY")
```
%% Cell type:code id: tags:
``` python
# Maximum number of tokens the model may generate, keyed by shot count.
max_tokens = {0: 1000, 1: 1000, 2: 2000}
```
%% Cell type:markdown id: tags:
### Prompting Models
%% Cell type:code id: tags:
``` python
models = [
'accounts/fireworks/models/starcoder-7b',
'accounts/fireworks/models/starcoder-16b',
'accounts/fireworks/models/llama-v2-13b-code-instruct',
'accounts/fireworks/models/llama-v2-34b-code-instruct',
'accounts/fireworks/models/llama-v2-70b-code-instruct',
'accounts/fireworks/models/mixtral-8x7b-instruct',
]
```
%% Cell type:code id: tags:
``` python
import requests
import json


def send_fireworks_request(model, api_key, model_type='chat', prompt_or_messages=None, max_tokens=1024,
                           temperature=0.6, top_p=1, top_k=40,
                           frequency_penalty=0, presence_penalty=0):
    """Call the Fireworks.ai inference API.

    For model_type 'chat' the chat-completions endpoint is used and
    prompt_or_messages must be a messages list; for 'completion' the plain
    completions endpoint is used and prompt_or_messages must be a prompt
    string.  Any other model_type raises ValueError.

    Returns the parsed JSON response; non-200 responses are wrapped as
    {"error": <body>} so callers can detect failure.
    """
    # Sampling parameters shared by both endpoints.
    payload = {
        "model": model,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "presence_penalty": presence_penalty,
        "frequency_penalty": frequency_penalty
    }

    # Pick the endpoint and the payload key the input goes under.
    if model_type == 'chat':
        endpoint = "chat/completions"
        payload["messages"] = prompt_or_messages
    elif model_type == 'completion':
        endpoint = "completions"
        payload["prompt"] = prompt_or_messages
    else:
        raise ValueError("Unsupported model type. Choose 'chat' or 'completion'.")

    resp = requests.post(
        "https://api.fireworks.ai/inference/v1/" + endpoint,
        json=payload,
        headers={
            "Accept": "application/json",
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        },
    )
    if resp.status_code != 200:
        return {"error": resp.json()}
    return resp.json()
```
%% Cell type:code id: tags:
``` python
shots = [0, 1, 2]

# Run every model against every shot's prompt, skipping results that
# already exist so the loop is resumable after interruption.
# NOTE(review): this notebook never imports Fireworks or defines api_key in
# the visible cells — confirm the langchain_fireworks import cell exists.
for shot in shots:
    print(f'Processing shot: {shot}')
    base_path = f'data/prompts/{shot}-shot'
    # Context manager closes the prompt file (open(...).read() leaked the handle).
    with open(f'{base_path}/prompt.txt', 'r') as prompt_file:
        prompt = prompt_file.read()
    for model in models:
        model_name = model.split('/')[-1]
        print(f'Processing model: {model_name}')
        results_dir = f'{base_path}/results'
        os.makedirs(results_dir, exist_ok=True)
        file_path = f'{results_dir}/{model_name}.fireworks.ai.txt'
        # Skip models that already have a saved result.
        if os.path.exists(file_path):
            print('Skipping...')
            continue
        llm = Fireworks(
            fireworks_api_key=api_key,
            model=model,
            max_tokens=max_tokens[shot])
        result = llm.invoke(prompt)
        with open(file_path, 'w') as file:
            file.write(result)
```
%% Cell type:markdown id: tags:
### XML Extraction from results
%% Cell type:code id: tags:
``` python
import os
import re  # NOTE(review): unused in this cell — kept in case other cells rely on it

base_dir = 'data/prompts'
shots = [0, 1, 2]

# Pull the <coverPage>...</coverPage> span out of each raw completion and
# save it as .xml alongside the .txt results.
for shot in shots:
    results_path = os.path.join(base_dir, f'{shot}-shot', 'results')
    results_xml_path = os.path.join(base_dir, f'{shot}-shot', 'results-xml')
    # Ensure the results-xml directory exists
    os.makedirs(results_xml_path, exist_ok=True)
    if os.path.isdir(results_path):
        for result_file in os.listdir(results_path):
            file_path = os.path.join(results_path, result_file)
            if not file_path.endswith('.fireworks.ai.txt'):
                continue
            with open(file_path, 'r') as file:
                result_content = file.read()
            start_tag = "<coverPage>"
            end_tag = "</coverPage>"
            # Use the LAST occurrence of each tag: few-shot prompts echo
            # example coverpages, and the model's answer comes last.
            start = result_content.rfind(start_tag)
            end_pos = result_content.rfind(end_tag)
            # BUG FIX: the original computed `end = rfind(...) + len(end_tag)`
            # before checking for -1, so a missing end tag was never detected,
            # and even when a tag was missing it fell through and wrote a
            # bogus slice.  Check first, and skip the file on failure.
            if start == -1 or end_pos == -1:
                print(f"No XML content found in {result_file}")
                continue
            xml_content = result_content[start:end_pos + len(end_tag)]
            # Prepare the filename and path for the extracted XML.
            xml_file_name = result_file.replace('.txt', '.xml')
            xml_file_path = os.path.join(results_xml_path, xml_file_name)
            with open(xml_file_path, 'w') as xml_file:
                xml_file.write(xml_content)
            print(f'Extracted and saved XML for {xml_file_name}')
```
%% Cell type:code id: tags:
``` python
import xml.etree.ElementTree as ET
import os
```
%% Cell type:code id: tags:
``` python
def validate_xml(xml_file):
    """Check that xml_file parses as well-formed XML.

    Args:
        xml_file: Path to the XML file to validate.

    Returns:
        (True, "") when the file parses cleanly, otherwise
        (False, "XML is not well-formed: <parser details>").
    """
    try:
        # The parse result is discarded — only well-formedness matters
        # (the original bound it to an unused `tree` local).
        ET.parse(xml_file)
        return True, ""
    except ET.ParseError as e:
        return False, f"XML is not well-formed: {e}"
```
%% Cell type:code id: tags:
``` python
# Report the well-formedness of every extracted XML file, per shot count.
for shot in [0, 1, 2]:
    print(f'Evaluating {shot}-shot')
    print('=' * 20)
    result_xml_dir = f'data/prompts/{shot}-shot/results-xml/'
    for xml_file in os.listdir(result_xml_dir):
        if not xml_file.endswith('.xml'):
            continue
        is_valid, message = validate_xml(f'{result_xml_dir}/{xml_file}')
        print(f'{xml_file} is {is_valid}: {message}')
```
%% Output
Evaluating 0-shot
====================
zephyr-7b-alpha.gpt.grc.xml is True:
zephyr-7b-beta.gpt.grc.xml is True:
gpt-35-turbo-0301.gpt.grc.xml is True:
mistral-7b-openorca.xml is True:
gpt-4-32k.gpt.grc.xml is True:
llama-2-70b-chat.xml is False: XML is not well-formed: mismatched tag: line 12, column 8
gpt-35-turbo-16k.grc.xml is True:
gpt-4.grc.xml is True:
llama-2-13b-chat.xml is True:
Evaluating 1-shot
====================
starcoder-16b.fireworks.ai.xml is False: XML is not well-formed: syntax error: line 2, column 0
llama-v2-34b-code-instruct.fireworks.ai.xml is True:
llama-v2-13b-code-instruct.xml is True:
starcoder-7b.fireworks.ai.xml is False: XML is not well-formed: syntax error: line 2, column 0
mixtral-8x7b-instruct_fireworks.ai.xml is True:
llama-2-70b-chat.gpt.jrc.xml is True:
zephyr-7b-beta.gpt.jrc.xml is True:
nous-hermes-2-mixtral-8x7b-dpo-gpt.jrc.xml is True:
mistral-7b-openorca.gpt.jrc.xml is True:
zephyr-7b-alpha.gpt.jrc.xml is True:
Evaluating 2-shot
====================
starcoder-16b.fireworks.ai.xml is False: XML is not well-formed: not well-formed (invalid token): line 2, column 1
llama-v2-34b-code-instruct.fireworks.ai.xml is True:
llama-v2-13b-code-instruct.fireworks.ai.xml is True:
starcoder-7b.fireworks.ai.xml is False: XML is not well-formed: not well-formed (invalid token): line 60, column 51
mixtral-8x7b-instruct.fireworks.ai.xml is False: XML is not well-formed: mismatched tag: line 50, column 26
zephyr-7b-beta.gpt.jrc.xml is False: XML is not well-formed: junk after document element: line 39, column 0
nous-hermes-2-mixtral-8x7b-dpo-gpt.jrc.xml is True:
mistral-7b-openorca.gpt.jrc.xml is True:
zephyr-7b-alpha.gpt.jrc.xml is True:
%% Cell type:code id: tags:
``` python
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment