I have a Python script that I am using to make SQL queries. The problem is that my VM only has 2 GB of RAM, and some of the queries are RAM-intensive enough that the kernel kills the script. How can I make this code more RAM-efficient? I would like to implement pagination in my PostgreSQL code. How would I do that? Does anyone know an easy way to implement it? I would greatly appreciate your help!
Updated Code
from __future__ import print_function

import sys
import json
import pprint

try:
    import psycopg2
except ImportError:
    # exit directly: anything after a bare raise would be unreachable
    sys.exit('\n\033[33mpsycopg2 library missing. pip install psycopg2\033[1;m\n')
outfilepath = "crtsh_output/crtsh_flat_file"
DB_HOST = 'crt.sh'
DB_NAME = 'certwatch'
DB_USER = 'guest'
# DELAY = 0
def connect_to_db():
    start = 0
    offset = 10
    flag = True
    while flag:
        filepath = 'forager.txt'
        with open(filepath) as fp:
            unique_domains = ''
            try:
                conn = psycopg2.connect("dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
                cursor = conn.cursor()
                cursor.itersize = 10000
                for cnt, domain_name in enumerate(fp):
                    print("Line {}: {}".format(cnt, domain_name))
                    domain_name = domain_name.rstrip()
                    cursor.execute('''SELECT c.id, x509_commonName(c.certificate), x509_issuerName(c.certificate), x509_notBefore(c.certificate), x509_notAfter(c.certificate), x509_issuerName(c.certificate), x509_keyAlgorithm(c.certificate), x509_keySize(c.certificate), x509_publicKeyMD5(c.certificate), x509_publicKey(c.certificate), x509_rsaModulus(c.certificate), x509_serialNumber(c.certificate), x509_signatureHashAlgorithm(c.certificate), x509_signatureKeyAlgorithm(c.certificate), x509_subjectName(c.certificate), x509_name(c.certificate), x509_name_print(c.certificate), x509_commonName(c.certificate), x509_subjectKeyIdentifier(c.certificate), x509_extKeyUsages(c.certificate), x509_certPolicies(c.certificate), x509_canIssueCerts(c.certificate), x509_getPathLenConstraint(c.certificate), x509_altNames(c.certificate), x509_altNames_raw(c.certificate), x509_cRLDistributionPoints(c.certificate), x509_authorityInfoAccess(c.certificate), x509_print(c.certificate), x509_anyNamesWithNULs(c.certificate), x509_extensions(c.certificate), x509_tbscert_strip_ct_ext(c.certificate), x509_hasROCAFingerprint(c.certificate)
                        FROM certificate c, certificate_identity ci WHERE
                        c.id = ci.certificate_id AND ci.name_type = 'dNSName' AND lower(ci.name_value) =
                        lower(%s) AND x509_notAfter(c.certificate) > statement_timestamp()''', (domain_name,))
                    # query db with start and offset
                    unique_domains = cursor.fetchall()
                    if not unique_domains:
                        flag = False
                    else:
                        # do processing with your data
                        pprint.pprint(unique_domains)
                        outfilepath = "crtsh2" + ".json"
                        with open(outfilepath, 'a') as outfile:
                            outfile.write(json.dumps(unique_domains, sort_keys=True, indent=4, default=str, ensure_ascii=False))
                        start += offset  # was "offset += limit", but limit is never defined (NameError); the query above does not use these yet
            except Exception as error:
                print(str(error))


if __name__ == "__main__":
    connect_to_db()
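For the pagination itself, a common approach is to give the query a stable ORDER BY plus LIMIT/OFFSET and loop until a page comes back empty, so that only one page of rows is ever held in memory. The following is a minimal sketch of that idea, not a drop-in replacement: fetch_pages, PAGED_QUERY, and page_size are illustrative names, and the column list is trimmed to two entries where the real query would keep the full x509_* list from above.

import psycopg2

# Illustrative, trimmed-down version of the question's query; the real one
# would keep the full x509_* column list.
PAGED_QUERY = '''SELECT c.id, x509_commonName(c.certificate)
    FROM certificate c, certificate_identity ci
    WHERE c.id = ci.certificate_id
      AND ci.name_type = 'dNSName'
      AND lower(ci.name_value) = lower(%s)
      AND x509_notAfter(c.certificate) > statement_timestamp()
    ORDER BY c.id        -- pagination needs a deterministic order
    LIMIT %s OFFSET %s'''

def fetch_pages(cursor, domain_name, page_size=1000):
    """Yield one page of rows at a time instead of fetchall() on everything."""
    offset = 0
    while True:
        cursor.execute(PAGED_QUERY, (domain_name, page_size, offset))
        rows = cursor.fetchall()
        if not rows:
            break            # past the last page
        yield rows
        offset += page_size

Each yielded page holds at most page_size rows, so peak memory stays bounded no matter how many certificates match. One caveat: large OFFSET values get progressively slower in Postgres, so for very large result sets keyset pagination (WHERE c.id > last_seen_id ORDER BY c.id LIMIT %s) scales better.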
cursor.fetchall() is called outside the loop - which means the database never gets a chance to close resources consumed by prior runs of the query. You should process the query results for each execute() call - append to a list, update a set, etc. The code also reopens crtsh2.json on each iteration. For simplicity, manage both files at the same time via with open(filepath) as fp, open('crtsh2.json') as outfile:
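Putting that advice together, a rough sketch of the reworked loop might look like the following. It reuses the DB_* constants from the question, assumes the full SELECT is kept in a QUERY constant, and switches to a named (server-side) psycopg2 cursor, because itersize only has an effect on named cursors - a plain client-side cursor downloads the entire result set during execute() no matter how the rows are fetched afterwards.

import json
import psycopg2

QUERY = '''...'''  # the full SELECT from the question, unchanged

def connect_to_db():
    conn = psycopg2.connect("dbname={0} user={1} host={2}".format(DB_NAME, DB_USER, DB_HOST))
    # One with-statement manages both files for the whole run.
    with open('forager.txt') as fp, open('crtsh2.json', 'a') as outfile:
        for cnt, domain_name in enumerate(fp):
            domain_name = domain_name.rstrip()
            # A named cursor is declared server-side, so rows stream to the
            # client in itersize batches instead of arriving all at once.
            cursor = conn.cursor(name='crtsh_cursor')
            cursor.itersize = 1000
            cursor.execute(QUERY, (domain_name,))
            # Process each batch as it arrives instead of one big fetchall().
            for rows in iter(lambda: cursor.fetchmany(1000), []):
                outfile.write(json.dumps(rows, sort_keys=True, indent=4, default=str, ensure_ascii=False))
            cursor.close()  # frees the server-side cursor before the next domain
    conn.close()

With this shape, memory use is bounded by one batch of rows per fetchmany() call, which should keep the script comfortably inside a 2 GB VM.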