Fastest way to insert 800 million rows into a Postgres database
I have a script that reads some files one by one, cleans each line, and inserts the rows into a Postgres database.
I tried to use Python multiprocessing with a Pool, but I found that CPU usage only occasionally reaches 30% and sits around 6% most of the time, so the load is still very slow.
Any suggestions for speeding it up?
Thank you.
import os
import multiprocessing
import psycopg2

path = "data/"
arr = os.listdir(path)

# Connection and cursor are created once at import time, so every forked
# worker process inherits and shares the same session.
connection = psycopg2.connect(
    user="postgres", password="blabla", host="127.0.0.1", port="5432", database="test"
)
cursor = connection.cursor()

# Upsert: insert the row, or update data2 when data1 already exists.
postgres_insert_query = """ INSERT INTO mobile (data1, data2) VALUES (%s,%s)
ON CONFLICT (data1)
DO
UPDATE SET data2 = EXCLUDED.data1 ;"""

def insert_data(key, record_to_insert, item):
    print(key)
    try:
        # executemany sends one INSERT per row, so each 50,000-row batch
        # means 50,000 round trips inside a single transaction.
        cursor.executemany(postgres_insert_query, record_to_insert)
        connection.commit()
        count = cursor.rowcount
        print(count, "Record inserted successfully into mobile table", item)
    except (Exception, psycopg2.Error) as error:
        print("Failed to insert record into mobile table", error)

i = 1

def process_data(item):
    print(item)
    global i
    records = []
    i += 1
    with open(path + item, 'r') as file:
        for line in file:
            line = dataCleansing(line)  # dataCleansing is defined elsewhere in the script
            records.append((line + '-' + str(i), 'data2-' + str(i) + line))
            if len(records) == 50000:
                insert_data(i, records, item)
                records = []
    # Flush whatever is left after the last full batch.
    insert_data(i, records, item)
    records = []

if __name__ == '__main__':
    a_pool = multiprocessing.Pool(6)
    result = a_pool.map(process_data, arr)
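For context, one likely reason CPU usage stays low is that the connection and cursor above live in the parent process and are inherited by all six forked workers, and executemany issues one INSERT per row. Below is a minimal sketch (not the original code) that gives each pool worker its own connection through the Pool initializer and sends each batch with psycopg2.extras.execute_values, which packs many rows into one statement. The helper names init_worker and insert_batch are made up for illustration, the connection settings and the upsert query are copied from the question, and the page_size value is just an example.

import multiprocessing
import psycopg2
from psycopg2.extras import execute_values

worker_conn = None  # one connection per worker process

def init_worker():
    # Runs once in each worker after the pool starts it.
    global worker_conn
    worker_conn = psycopg2.connect(
        user="postgres", password="blabla", host="127.0.0.1", port="5432", database="test"
    )

def insert_batch(records):
    # Same upsert as in the question, but execute_values expands VALUES %s
    # into multi-row statements instead of sending one INSERT per row.
    upsert_query = """INSERT INTO mobile (data1, data2) VALUES %s
ON CONFLICT (data1)
DO
UPDATE SET data2 = EXCLUDED.data1 ;"""
    with worker_conn.cursor() as cur:
        execute_values(cur, upsert_query, records, page_size=10000)
    worker_conn.commit()

if __name__ == '__main__':
    a_pool = multiprocessing.Pool(6, initializer=init_worker)
    # a_pool.map(process_data, arr) as before, with the insert_data calls
    # replaced by insert_batch(records).

If the ON CONFLICT clause turns out not to be needed, cursor.copy_from (or copy_expert) on each worker's own connection is usually the fastest way psycopg2 can load this volume of rows.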