Skip to content

Commit 721f0f8

Browse files
authored
Merge pull request #36 from cuappdev/fix-duplicate-games
Fix duplicate games and fetching logic
2 parents bd14659 + 756ec32 commit 721f0f8

6 files changed

Lines changed: 249 additions & 13 deletions

File tree

src/database.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ def keep_connection_alive():
4949
# Access the database
5050
db = client[os.getenv("MONGO_DB", "score_db")]
5151

52-
5352
def setup_database_indexes():
5453
"""Set up MongoDB indexes for optimal query performance"""
5554
try:
@@ -65,6 +64,31 @@ def setup_database_indexes():
6564

6665
# Index for sorting operations
6766
game_collection.create_index([("date", -1)], background=True)
67+
68+
# Index to have unique games so we won't add duplicates
69+
game_collection.create_index(
70+
[
71+
("sport", 1),
72+
("gender", 1),
73+
("date", 1),
74+
("opponent_id", 1),
75+
("state", 1),
76+
],
77+
unique=True,
78+
background=True
79+
)
80+
81+
# Additional index for tournament games (without opponent_id)
82+
game_collection.create_index(
83+
[
84+
("sport", 1),
85+
("gender", 1),
86+
("date", 1),
87+
("city", 1),
88+
("state", 1),
89+
],
90+
background=True
91+
)
6892

6993
print("✅ MongoDB indexes created successfully")
7094
except Exception as e:

src/repositories/game_repository.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,56 @@ def find_by_key_fields(city, date, gender, location, opponent_id, sport, state):
130130

131131
return [Game.from_dict(game) for game in games]
132132

133+
@staticmethod
def find_by_tournament_key_fields(city, date, gender, location, sport, state):
    """
    Look up games by date, gender, sport and venue fields, ignoring opponent.

    The opponent id is deliberately left out of the filter so that a game
    stored with a placeholder opponent can still be found. A falsy city,
    state or location is queried as None instead of the supplied value.

    Returns:
        None when nothing matches, a single Game when exactly one document
        matches, otherwise a list of Game objects.
    """
    def one_of(value):
        # Falsy inputs (None, "") are looked up as None.
        return {"$in": [value] if value else [None]}

    query = {
        "date": date,
        "gender": gender,
        "sport": sport,
        "city": one_of(city),
        "state": one_of(state),
        "location": one_of(location),
    }

    matches = [Game.from_dict(doc) for doc in db["game"].find(query)]
    if not matches:
        return None
    return matches[0] if len(matches) == 1 else matches
182+
133183
@staticmethod
134184
def find_by_sport(sport):
135185
"""
@@ -156,3 +206,31 @@ def find_by_sport_gender(sport, gender):
156206
game_collection = db["game"]
157207
games = game_collection.find({"sport": sport, "gender": gender})
158208
return [Game.from_dict(game) for game in games]
209+
210+
@staticmethod
def find_games_by_sport_gender_after_date(sport, gender, after_date=None):
    """
    Return the games for a sport and gender as Game objects.

    When ``after_date`` is truthy, only games whose ``utc_date`` is strictly
    greater than it are returned; otherwise no date filter is applied.
    """
    criteria = {"sport": sport, "gender": gender}
    if after_date:
        criteria["utc_date"] = {"$gt": after_date}

    return list(map(Game.from_dict, db["game"].find(criteria)))
228+
229+
@staticmethod
def delete_games_by_ids(game_ids):
    """
    Delete every game whose ``_id`` is in ``game_ids``.

    Returns:
        The number of documents reported as deleted.
    """
    id_filter = {"_id": {"$in": game_ids}}
    return db["game"].delete_many(id_filter).deleted_count

src/scrapers/games_scraper.py

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
from src.utils.convert_to_utc import convert_to_utc
55
from src.utils.constants import *
66
from src.scrapers.game_details_scrape import scrape_game
7-
from src.utils.helpers import get_dominant_color
7+
from src.utils.helpers import get_dominant_color, normalize_game_data, is_tournament_placeholder_team, is_cornell_loss
88
import base64
99
import re
10-
import html
10+
from src.database import db
1111
import threading
1212

1313

@@ -164,6 +164,8 @@ def process_game_data(game_data):
164164
Args:
165165
game_data (dict): A dictionary containing the game data.
166166
"""
167+
168+
game_data = normalize_game_data(game_data)
167169
location_data = game_data["location"].split("\n")
168170
geo_location = location_data[0]
169171
if (",") not in geo_location:
@@ -232,16 +234,28 @@ def process_game_data(game_data):
232234
if str(final_box_cor_score) != str(cor_final) or str(final_box_opp_score) != str(opp_final):
233235
game_data["score_breakdown"] = game_data["score_breakdown"][::-1]
234236

235-
# finds any existing game with the same key fields regardless of time
236-
curr_game = GameService.get_game_by_key_fields(
237+
# Try to find by tournament key fields to handle placeholder teams
238+
curr_game = GameService.get_game_by_tournament_key_fields(
237239
city,
238240
game_data["date"],
239241
game_data["gender"],
240242
location,
241-
team.id,
242243
game_data["sport"],
243244
state
244245
)
246+
247+
# If no tournament game found, try the regular lookup with opponent_id
248+
if not curr_game:
249+
curr_game = GameService.get_game_by_key_fields(
250+
city,
251+
game_data["date"],
252+
game_data["gender"],
253+
location,
254+
team.id,
255+
game_data["sport"],
256+
state
257+
)
258+
245259
if isinstance(curr_game, list):
246260
if curr_game:
247261
curr_game = curr_game[0]
@@ -253,8 +267,19 @@ def process_game_data(game_data):
253267
"result": game_data["result"],
254268
"box_score": game_data["box_score"],
255269
"score_breakdown": game_data["score_breakdown"],
256-
"utc_date": utc_date_str
270+
"utc_date": utc_date_str,
271+
"city": city,
272+
"location": location,
273+
"state": state
257274
}
275+
276+
current_team = TeamService.get_team_by_id(curr_game.opponent_id)
277+
if current_team and is_tournament_placeholder_team(current_team.name):
278+
updates["opponent_id"] = team.id
279+
280+
if is_cornell_loss(game_data["result"]) and game_data["utc_date"]:
281+
GameService.handle_tournament_loss(game_data["sport"], game_data["gender"], game_data["utc_date"])
282+
258283
GameService.update_game(curr_game.id, updates)
259284
return
260285

@@ -272,5 +297,5 @@ def process_game_data(game_data):
272297
"score_breakdown": game_data["score_breakdown"],
273298
"utc_date": utc_date_str
274299
}
275-
300+
276301
GameService.create_game(game_data)

src/services/game_service.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from src.repositories.game_repository import GameRepository
22
from src.models.game import Game
33
from src.services.team_service import TeamService
4+
from src.utils.helpers import is_tournament_placeholder_team
45

56

67
class GameService:
@@ -33,6 +34,7 @@ def create_game(data):
3334
opponent_id = data.get("opponent_id")
3435
if not TeamService.get_team_by_id(opponent_id):
3536
raise ValueError(f"Opponent team with id {opponent_id} does not exist.")
37+
3638
game = Game(**data)
3739
GameRepository.insert(game)
3840
return game
@@ -69,6 +71,16 @@ def get_game_by_key_fields(city, date, gender, location, opponent_id, sport, sta
6971
city, date, gender, location, opponent_id, sport, state
7072
)
7173

74+
@staticmethod
def get_game_by_tournament_key_fields(city, date, gender, location, sport, state):
    """
    Look up a game by venue, date, gender and sport without using opponent_id.

    Delegates to GameRepository.find_by_tournament_key_fields and returns
    whatever that lookup returns.
    """
    return GameRepository.find_by_tournament_key_fields(
        city=city,
        date=date,
        gender=gender,
        location=location,
        sport=sport,
        state=state,
    )
83+
7284
@staticmethod
7385
def get_games_by_sport(sport):
7486
"""
@@ -89,3 +101,50 @@ def get_games_by_sport_gender(sport, gender):
89101
Retrieves all game by its sport and gender.
90102
"""
91103
return GameRepository.find_by_sport_gender(sport, gender)
104+
105+
@staticmethod
def get_tournament_games_by_sport_gender(sport, gender, after_date=None):
    """
    Return the games for a sport and gender whose opponent is a placeholder team.

    ``after_date`` is forwarded to the repository query as an optional
    lower bound. Games whose opponent team cannot be found are left out.
    """
    def has_placeholder_opponent(game):
        opponent = TeamService.get_team_by_id(game.opponent_id)
        return bool(opponent) and is_tournament_placeholder_team(opponent.name)

    candidates = GameRepository.find_games_by_sport_gender_after_date(
        sport, gender, after_date
    )
    return [game for game in candidates if has_placeholder_opponent(game)]
120+
121+
@staticmethod
def delete_tournament_games_by_sport_gender(sport, gender, after_date=None):
    """
    Delete the games for a sport and gender whose opponent is a placeholder team.

    ``after_date`` is forwarded to the repository query as an optional
    lower bound.

    Returns:
        The deleted count reported by the repository, or 0 when no game
        qualified for deletion.
    """
    def has_placeholder_opponent(game):
        opponent = TeamService.get_team_by_id(game.opponent_id)
        return bool(opponent) and is_tournament_placeholder_team(opponent.name)

    candidates = GameRepository.find_games_by_sport_gender_after_date(
        sport, gender, after_date
    )
    doomed_ids = [game.id for game in candidates if has_placeholder_opponent(game)]

    if not doomed_ids:
        return 0
    return GameRepository.delete_games_by_ids(doomed_ids)
138+
139+
@staticmethod
def handle_tournament_loss(sport, gender, loss_date):
    """
    React to a tournament loss by removing later placeholder-opponent games.

    Args:
        sport (str): The sport of the team that lost
        gender (str): The gender of the team that lost
        loss_date: Lower bound passed on as the ``after_date`` filter

    Returns:
        The number of games deleted.
    """
    return GameService.delete_tournament_games_by_sport_gender(
        sport, gender, loss_date
    )

src/services/team_service.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from src.repositories import TeamRepository
22
from src.models.team import Team
33

4-
54
class TeamService:
65
@staticmethod
76
def get_all_teams():
@@ -13,14 +12,25 @@ def get_all_teams():
1312
@staticmethod
def create_team(team_data):
    """
    Create a team, or update the stored one when the name already exists.

    Args:
        team_data (dict): The data for the team; must contain a non-empty "name".

    Returns:
        Team: The newly inserted team, or the existing team object as it was
        fetched before the update was applied.

    Raises:
        ValueError: If ``team_data`` has no truthy "name".
    """
    name = team_data.get("name")
    if not name:
        raise ValueError("Team name is required to create a team.")

    existing = TeamService.get_team_by_name(name)
    if not existing:
        # No team with this name yet: insert a fresh one.
        team = Team(**team_data)
        TeamRepository.insert(team)
        return team

    # The lookup may hand back a list; it is non-empty here because it is truthy.
    if isinstance(existing, list):
        existing = existing[0]

    TeamService.update_team(existing.id, team_data)
    return existing

src/utils/helpers.py

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,44 @@ def get_dominant_color(image_url, white_threshold=200, black_threshold=50):
5454
return hex_color
5555
except Exception as e:
5656
logging.error(f"Error in get_dominant_color for {image_url}: {e}")
57-
return default_color
57+
return default_color
58+
59+
def normalize_game_data(data: dict) -> dict:
    """
    Normalize placeholder values like TBA/TBD into None.

    The "time", "city" and "state" fields are inspected. A string value that
    equals "TBA" or "TBD" once surrounding whitespace is stripped and case is
    ignored is replaced with None. Missing fields are not added, and
    non-string values are left untouched.

    Args:
        data: Game data dictionary; it is modified in place.

    Returns:
        The same dictionary object that was passed in.
    """
    placeholders = {"TBA", "TBD"}

    for field in ("time", "city", "state"):
        value = data.get(field)
        # Only strings can be placeholders; this also avoids hashing
        # unhashable values (e.g. lists) in the set-membership test.
        if isinstance(value, str) and value.strip().upper() in placeholders:
            data[field] = None

    return data
70+
71+
def is_tournament_placeholder_team(team_name: str):
    """
    Tell whether a team name is one of the known tournament placeholder names.

    The comparison is an exact, case-sensitive match against a fixed set of
    round and championship names.
    """
    generic_rounds = (
        "First Round", "Second Round", "Third Round", "Quarterfinals",
        "Regional Semifinals", "Regional Championship", "National Semifinals",
        "TBD", "National Championship",
    )
    college_cup_rounds = (
        "College Cup Semifinals", "College Cup Championship Game",
    )
    ecac_hockey_rounds = (
        "ECAC Hockey First Round", "ECAC Hockey Quarterfinals",
        "ECAC Hockey Semifinals", "ECAC Hockey Championship Game",
    )
    ncaa_events = (
        "NCAA Wrestling Championships", "NCAA Northeast Regional CHampionships",
        "NCAA Cross Country Championships",
    )

    known_placeholders = (
        generic_rounds + college_cup_rounds + ecac_hockey_rounds + ncaa_events
    )
    return team_name in known_placeholders
86+
87+
def is_cornell_loss(result: str) -> bool:
    """
    Check if the result indicates a Cornell loss.

    A loss is recognized when the result contains a standalone capital "L"
    (for example "L, 2-3" or "L2-3") or the word "loss" / "defeated" in any
    letter case. Indicators must not be glued to other letters, so a capital
    L inside another word (a venue or opponent name, say) does not count.

    Args:
        result: Result string for the game; may be None or empty.

    Returns:
        True when a loss indicator is found, False otherwise (including for
        an empty or missing result).
    """
    import re  # local import: this block cannot rely on a module-level one

    if not result:
        return False

    # A plain substring test for "L" matched any capital L in the string and
    # reported wins as losses; require that no letter touches the indicator.
    loss_pattern = r"(?<![A-Za-z])(?:L|(?i:loss|defeated))(?![A-Za-z])"
    return re.search(loss_pattern, result) is not None

0 commit comments

Comments
 (0)