comercioeletronico-recomend.../002-normalize.py

230 lines
5.8 KiB
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import someUtils
import sqlite3
print('preparing envronment')
someUtils.rmf('database.db-journal')
someUtils.rmf('database.db')
someUtils.rmf('database_step2.db')
someUtils.cp('database_step1.db','database.db')
print('dealing with data')
connection = sqlite3.connect('database.db')
cursor = connection.cursor()
print('>> add year to movies')
cursor.execute('ALTER TABLE movies ADD year int')
cursor.execute('update movies set year = cast(substr(title,-1,-4) as int)')
cursor.execute('update movies set title=substr(title, 0, length(title)-6) where year<>0')
print('>> normalize genres')
genres = set()
for row in cursor.execute('select distinct genres from movies where genres<>"(no genres listed)" and genres is not null'):
genres = genres.union(set(row[0].split('|')))
del row
genres = list(sorted(genres))
cursor.execute('''
CREATE TABLE genres (
genreId serial not null,
genre varchar(255),
primary key (genreId)
);''')
genreAdd = []
for genre in genres:
genreAdd.append([len(genreAdd)+1,genre])
del genre
cursor.executemany('insert into genres(genreId,genre) values (?,?)',genreAdd)
del genreAdd
cursor.execute('''
CREATE TABLE moviegenre (
movieId int,
genreId int,
primary key (movieId, genreId),
foreign key (movieId) references movies(movieId),
foreign key (genreId) references genres(genreId)
);''')
cursor.execute('''
insert into
moviegenre(movieId,genreId)
select
movies.movieId,
genres.genreId
from
movies
left join
genres
where
'|' || movies.genres || '|'
like
'%|'||genres.genre||'|%'
''')
# SQLite não suporta "ALTER TABLE tbl DROP COLUMN col"
cursor.execute('''
CREATE TABLE movies2 (
movieId serial,
title varchar(255),
year int,
primary key (movieId)
);''')
cursor.execute('''
insert into
movies2(movieId,title,year)
select
movieId,
title,
year
from
movies
''')
cursor.execute('''drop table movies''')
cursor.execute('''
CREATE TABLE movies (
movieId serial,
title varchar(255),
year int,
primary key (movieId)
);''')
cursor.execute('''
insert into
movies(movieId,title,year)
select
movieId,
title,
year
from
movies2
''')
cursor.execute('''drop table movies2''')
print('>> normalize tags')
cursor.execute('update tags set tag=lower(tag)')
cursor.execute('alter table tags rename to movietag2')
cursor.execute('''
CREATE TABLE tags (
tagId serial,
tag varchar(255),
primary key (tagId)
);''')
cursor.execute('''
CREATE TABLE movietag (
userId int,
movieId int,
tagId int,
timestamp int,
foreign key (movieId) references movies(movieId)
foreign key (tagId) references tags(tagId)
);''')
tags = list()
for row in cursor.execute('select distinct tag from movietag2 order by tag'):
tags.append(row[0])
del row
tags = list(sorted(tags))
tagAdd = []
for tag in tags:
tagAdd.append([len(tagAdd)+1,tag])
del tag
cursor.executemany('insert into tags(tagId,tag) values (?,?)',tagAdd)
del tagAdd
cursor.execute('''
insert into movietag(userId,movieId,tagId,timestamp)
select
movietag2.userId,
movietag2.movieId,
tags.tagId,
movietag2.timestamp
from
movietag2
inner join
tags
on (movietag2.tag = tags.tag)
''')
cursor.execute('drop table movietag2')
print('>> preprocessing')
import libRecomendacao
cursor.execute('''CREATE TABLE preprocessadoUsuario (
userIdMaior int,
userIdMenor int,
similaridade float
)''')
# cursor.execute('''CREATE TABLE preprocessadoFilme (
# movieIdMaior int,
# movieIdMenor int,
# similaridade float
# )''')
cursor.execute('select movieId from movies order by movieId desc limit 1')
totalFilmes = cursor.fetchone()[0]
cursor.execute('select distinct userId from ratings order by userId desc limit 1')
totalUsuarios = cursor.fetchone()[0]
matrizNotas = [[None for x in range(totalFilmes+1)] for y in range(totalUsuarios+1)]
# matrizFilmes = [[None for x in range(totalUsuarios+1)] for y in range(totalFilmes+1)]
print(totalUsuarios, totalFilmes)
cursor.execute('select userId, movieId, rating from ratings')
for rowRating in cursor.fetchall():
matrizNotas[rowRating[0]][rowRating[1]] = rowRating[2]
# matrizFilmes[rowRating[1]][rowRating[0]] = rowRating[2]
# print('>>>> Filmes')
# allNull = []
# for filmeMaior in range(totalFilmes+1):
# if all(map(lambda a: a is None,matrizFilmes[filmeMaior])):
# allNull.append(filmeMaior)
# continue
# for filmeMenor in range(filmeMaior):
# if filmeMenor in allNull:
# continue
# sim = libRecomendacao.similaridade(
# matrizFilmes[filmeMaior],
# matrizFilmes[filmeMenor]
# )
# if sim is None:
# continue
# cursor.execute(
# 'insert into preprocessadoFilme(movieIdMaior,movieIdMenor,similaridade) values (?,?,?)',
# (
# filmeMaior,
# filmeMenor,
# sim
# )
# )
print('>>>> Usuários')
allNull = []
for usuarioMaior in range(totalUsuarios+1):
if all(map(lambda a: a is None,matrizNotas[usuarioMaior])):
allNull.append(usuarioMaior)
continue
print(usuarioMaior)
for usuarioMenor in range(usuarioMaior):
if usuarioMenor in allNull:
continue
sim = libRecomendacao.similaridade(
matrizNotas[usuarioMaior],
matrizNotas[usuarioMenor]
)
if sim is None:
continue
ss = (
usuarioMaior,
usuarioMenor,
sim
)
cursor.execute(
'insert into preprocessadoUsuario(userIdMaior,userIdMenor,similaridade) values (?,?,?)',
ss
)
pass
cursor.close()
connection.commit()
connection.close()
someUtils.mv('database.db','database_step2.db')